#! /usr/bin/gawk -f
# Must specify -f eva2erg.gawk

# Creates a raw concordance for all words and short phrases in 
# a label/title dictionary
#
# Usage: 
#
#    cat TEXT \
#      | enum-label-phrases -f eva2erg.gawk \
#          [-v maxlen=MAXLEN] \
#      > OCCS 
#
# This script reads from stdin a database of labels and titles,
# in the ".idx" format (see Note-010.html). Each input record should
# have the form 
#
#   LOCATION|LABEL|CLASS|MEANING|SECTION|COMMENTS
#   1        2     3     4       5       6
# 
# and, for each occurrence of every word or
# sufficiently short phrase in it, writes to stdout a record of the
# form
#
#   FNUM UNIT LINE TRANS START LENGTH POS STRING OBS
#   1    2    3    4     5     6      7   8      9
#   
# where FNUM, UNIT, LINE and TRANS are the components of the LOCATION,
# and the OBS field is the join SECTION ":" MEANING (with blanks in
# MEANING replaced by "_"), or just SECTION when MEANING is "?".  The
# POS field is set to zero.  The other fields are as defined in
# enum-text-phrases, except that the output STRINGs will always be
# confined to a single input line.

function error(msg)
{
  # Reports "msg" on stderr, prefixed with the current input record
  # number (NR), sets the global "abort" flag, and terminates the run.
  #
  # The exit status is non-zero so that a calling shell or pipeline can
  # tell a failed run from a successful one (a bare "exit" would
  # report status 0).
  printf "line %d: %s\n", NR, msg > "/dev/stderr"
  abort = 1
  exit 1
}

function gather_words(str, wd, wo, wl, \
  k, kb, m, n, b, c, ww)
{
  # Splits "str" into its "."-delimited words.
  #
  # The string is first cleaned with erg_erase_comments and
  # erg_unify_word_spaces (defined outside this file; presumably they
  # strip inline comments and normalize word separators to "." --
  # confirm against eva2erg.gawk).
  #
  # For the ith word whose packed form (erg_pack) is non-empty:
  #   wd[i] = the packed word,
  #   wo[i] = 1-based index of the raw word within the cleaned string,
  #   wl[i] = length of the raw word (before packing).
  #
  # The arrays are not cleared here; the caller must pass them empty.
  # Returns the number of words stored.
  #
  # All scratch variables, including the counter "n", are declared as
  # locals so that no global of the same name is clobbered.
  str = erg_unify_word_spaces(erg_erase_comments(str));
  # Sentinel delimiters guarantee every word has a "." on both sides.
  str = ("." str ".");
  m = length(str);
  n = 0;
  kb = 0;
  b = substr(str,1,1);
  for(k=2; k<=m; k++)
    { c = substr(str,k,1);
      # A delimiter followed by a non-delimiter starts a word at k.
      if ((b == ".") && (c != ".")) { kb = k; }
      # A non-delimiter followed by a delimiter ends the word at k-1.
      if ((b != ".") && (c == "."))
        { ww = erg_pack(substr(str, kb, k-kb));
          if (ww != "")
            { n++;
              wd[n] = ww;
              # Subtract 1 to discount the leading sentinel ".".
              wo[n] = kb - 1;
              wl[n] = k - kb;
            }
        }
      b = c;
    }
  return n;
}

function dump_phrases(lc, wd, wo, wl, nw, ob,   \
   first, last, chars, span, phrase, note, pos)
{
  # Emits one output record per word and per short phrase that can be
  # built from consecutive words "wd[1..nw]".
  #
  #   lc  location string; its "." and ";" separators become blanks,
  #       so its components are printed as separate fields.
  #   wo  starting index of each word, wl its length.
  #   ob  observation text; printed as "-" when empty.
  #
  # A phrase starting at word "first" always includes that word, and is
  # extended while the summed lengths of its packed words (separators
  # not counted) stay within the global "maxlen".
  # Increments the global counters "nWords" and "nPhrases".
  gsub(/[.;]/, " ", lc);
  note = (ob == "" ? "-" : ob);
  pos = 0;
  for (first = 1; first <= nw; first++)
    { nWords++;
      chars = 0;
      phrase = "";
      for (last = first; last <= nw; last++)
        { # The first word is unconditional; later ones must fit.
          if ((last != first) && (chars + length(wd[last]) > maxlen)) { break; }
          chars += length(wd[last]);
          phrase = (last == first ? wd[last] : (phrase "." wd[last]));
          # Extent from the start of the first word to the end of the last.
          span = (wo[last] - wo[first]) + wl[last];
          printf "%s %d %d %d %s %s\n", lc, wo[first], span, pos, phrase, note;
          nPhrases++;
        }
    }
}

# === ACTIONS ===================================================

BEGIN {
  # Input records are "|"-separated; the abort flag and the
  # word/phrase counters all start at zero.
  FS = "|";
  abort = 0;
  nWords = nPhrases = 0;
}

/^#/ {
  # Lines starting with "#" are comments: stop if a previous error
  # set the abort flag, otherwise skip the line.
  if (abort) { exit; }
  next;
}

/./ {
  # Non-empty data record: pick apart the "|"-separated fields and emit
  # every word and short phrase found in the LABEL field.
  if (abort) { exit; }

  # Start from empty word tables for this record.
  split("", lengths);
  split("", indices);
  split("", words);

  location = $1;
  text = $2;
  class = $3;
  section = $5;
  # Blanks in the meaning become "_" so it stays a single output field.
  meaning = gensub(/ /, "_", "g", $4);

  # OBS is SECTION alone when MEANING is "?", else SECTION ":" MEANING.
  if (meaning == "?")
    { obs = section; }
  else
    { obs = (section ":" meaning); }

  nw = gather_words(text, words, indices, lengths);
  dump_phrases(location, words, indices, lengths, nw, obs);
  next;
}