#! /usr/bin/gawk -f
# Last edited on 1999-01-04 17:59:07 by stolfi

# Creates a raw concordance for all words and short phrases in an
# interlinear file.
#
# Usage: 
#
#    cat TEXT \
#      | enum-text-phrases -f eva2erg.gawk \
#          [-v maxLength=MAXLEN] \
#          [-v leftContext=LCONT] \
#          [-v rightContext=RCONT] \
#      > OCCS 
#
# This script reads a single-version text from stdin (in plain or
# ".evt" interlinear format), and, for every occurrence of every word
# or sufficiently short phrase in it, writes to stdout a record of the
# form
#
#   LOC TRANS START LENGTH LCTX PHRASE RCTX
#   1   2     3     4      5    6      7   
#   
# where 
#   
#   LOC        is a line locator, e.g. "f86v5.L5" or "f86v5.C2.12a"
#
#   TRANS      is a letter identifying a transcriber, e.g. "F" for Friedman
#
#   START      is the index of the phrase in the line (START=1 means col. 20).
# 
#   LENGTH     is the original length of the phrase in the text, 
#              including fillers, {}-comments, spaces, etc..
#              
#   LCTX       is a word or word sequence, the left context of PHRASE,
#
#   PHRASE     is the word or word sequence in question.
#
#   RCTX       is a word or word sequence, the right context of PHRASE
#
#
# The PHRASE does not contain any EVA fillers ([!% ]), newlines, or
# {}-comments.
#
# A word is a string of characters delimited by any of the EVA
# word-spaces [-=,.]. (Note that ASCII blanks are treated as fillers
# and not word breaks.)
#
# The PHRASE may extend across gaps and ordinary line breaks
# ("-") but not paragraph breaks ("=") or changes of textual unit. In
# that case the LENGTH field does not count the newline characters,
# columns 1-19, or any intervening "#"-comments.
#
# In both the phrase and the context strings, the uncertain spaces ","
# are replaced by ordinary spaces ".". The line and paragraph breaks
# are preserved, except that line-final "-" is changed to "/" to
# distinguish it from embedded "-" denoting gaps, vellum defects, or
# intruding figures.
#
# A multi-word PHRASE is considered sufficiently short if it contains
# at most MAXLEN EVA characters, not counting fillers and word spaces.
#
# The context fields LCTX and RCTX are written only if the parameters
# "leftContext" and "rightContext" are specified.  If they are
# present, each of these two fields includes the word separator
# characters that delimit it on both sides.  The left context includes
# as many whole words as needed to make "leftContext" characters
# (including all delims after the first); and symmetrically for
# "rightContext".
#
# If the file is in ".evt" format, this script will get the LOC
# and TRANS fields from columns 1-19. The TRANS code is
# optional, but the unit must be explicit (i.e. there should be no
# "anonymous" units).  If the input file is not in ".evt" format, the
# script assumes the location code is f0.P.NNN;X where NNN is the
# input record number.

function gather_words(str, wp, wl, ws, \
  k, kb, m, n, b, c, bSep, cSep)
{
  # Stores in wp[i] the index of the first char 
  # of the ith non-empty word of "str".
  # Also stores in wl[i] its length, and in ws[i] the following 
  # separator (or "." if for some reason the separator is not found). 
  # Returns the number of words found.
  # Assumes "str" has been cleaned of comments.
  #
  # Fixed: "n" is now declared as a local (it used to clobber a
  # global "n"); gensub's "how" argument was the invalid " " (numeric
  # 0, which gawk warns about) and is now "g" — the pattern is
  # anchored at end-of-string, so at most one match exists either way.
  # The unused local "i" was dropped.

  # Turn line-final "-" into "/", keeping any trailing fillers:
  str = gensub(/[-]([!%]*)$/, "/\\1", "g", str);

  # Replace uncertain spaces by "."
  gsub(/[,]/, ".", str);

  # Pad with "." sentinels so every word is delimited on both sides:
  str = ("." str ".");
  m = length(str); 
  n = 0;
  b = substr(str,1,1);
  if (b != ".") { error("internal padding error"); exit; }
  bSep = 1;
  for(k=2; k<=m; k++)
    { c = substr(str,k,1);
      cSep = (match(c, /[-/.,=]/) != 0);
      # A word starts on a sep->nonsep transition and ends on the
      # following nonsep->sep transition:
      if (bSep && (! cSep)) { kb = k; }
      if ((! bSep) && cSep) 
        { n++; 
          wp[n] = kb - 1;   # "- 1" compensates for the leading sentinel
          wl[n] = k-kb;
          ws[n] = c;
        }
      b = c; bSep = cSep;
    }
  # The last char examined must be the trailing sentinel:
  if (c != ".") { error("internal padding error"); exit; }
  return n;
}

# === THE WORD BUFFER ===========================================

function clear_words()
{
  # Resets the global word buffer to its empty state.
  # Each split("", a) empties the array "a".
  nbuf = 0;
  split("", wskp); # wskp[i] is the number of non-word bytes skipped after wbuf[i].
  split("", wsep); # wsep[i] is the word separator after wbuf[i].
  split("", wfof); # wfof[i] is the index of the first char after wbuf[i].
  split("", wiof); # wiof[i] is the index of wbuf[i]'s first char in the line.
  split("", wloc); # wloc[i] the loc and trcode of line containing wbuf[i] (sep " ").
  split("", wlen); # wlen[i] is length(wbuf[i]).
  split("", wbuf); # wbuf[1..nbuf] are the saved words, squeezed.
  wskp[0] = 0;     # Sentinel: fillers seen before the first word.
} 

function append_word(wd, sep, loc, iindex, findex)
{
  # Pushes the word "wd" onto the word buffer, along with its trailing
  # separator "sep", the locator "loc" of its line, and the range
  # [iindex..findex-1] of columns it occupied in the original text.
  if (iindex < 1) { error("append_word: iindex error"); exit; }
  if (findex <= iindex) { error("append_word: findex error"); exit; }
  nbuf++;
  wskp[nbuf] = 0;             # no fillers skipped after it yet
  wsep[nbuf] = sep;
  wfof[nbuf] = findex;
  wiof[nbuf] = iindex;
  wloc[nbuf] = loc;
  wlen[nbuf] = length(wd);
  wbuf[nbuf] = wd;
}

function append_filler(len)
{
  # Accounts for "len" filler bytes skipped after the last word
  # currently in the buffer (slot 0 if the buffer is empty).
  if (len < 0) { error("append_filler: internal error"); exit; }
  wskp[nbuf] += len;
}

function dump_phrases(   \
   i, j, k, len, off, locw, olen)
{
  # Writes to stdout all words and short phrases currently in
  # the word buffer, then clears it.  Increments the global counters
  # nWords (one per buffered word) and nPhrases (one per record).
  #
  # Fixed: "off" was missing from the local-variable list and leaked
  # into the global namespace; the declared-but-unused locals "ctx"
  # and "wtmp" were dropped.
  for (i=1; i<=nbuf; i++)
    { nWords++;
      j = i;
      len = 0;           # squeezed length of the phrase so far
      off = wiof[i];     # start column of the phrase in its line
      olen = 0;          # original length, incl. fillers and comments
      locw = wloc[i];
      # A single word is always emitted; longer phrases only while
      # the squeezed length stays within maxLength:
      while((j <= nbuf) && ((j == i) || (len + wlen[j] <= maxLength)))
        { # Output one record for phrase "wbuf[i..j]":
          olen += wfof[j] - wiof[j];
          len += wlen[j];
          printf "%s %d %d", locw, off, olen; 
          if (leftContext >= 0)  { print_left_context(i, leftContext); }
          printf " %s", wbuf[i]; 
          for (k=i+1; k<=j; k++) { printf "%s%s", wsep[k-1],wbuf[k]; }
          if (rightContext >= 0) { print_right_context(j, rightContext); }
          printf "\n"; 
          # Extending past word j also spans the fillers after it:
          olen += wskp[j];
          j++;
          nPhrases++;
        }
    }
  clear_words();
}

function print_left_context(i, minlen,   k, len)
{
  # Prints the words preceding "wbuf[i]", and their delimiters, with
  # at least "minlen" total word characters (each word contributes its
  # length plus one for its separator).

  # Walk backwards until enough characters are covered:
  len = 0;
  for (k = i-1; (len < minlen) && (k >= 1); k--) { len += 1 + wlen[k]; }
  # Leading delimiter: the separator before the first context word,
  # or "=" when the context runs off the start of the buffer:
  printf " %s", (k >= 1 ? wsep[k] : "=");
  # Now print the context words in order:
  for (k++; k < i; k++) { printf "%s%s", wbuf[k], wsep[k]; }
}

function print_right_context(j, minlen,   k, len)
{
  # Prints the separator after "wbuf[j]" and then as many following
  # words (with their delimiters) as needed to cover at least
  # "minlen" total word characters.

  printf " %s", wsep[j];
  len = 0;
  for (k = j+1; (len < minlen) && (k <= nbuf); k++)
    { printf "%s%s", wbuf[k], wsep[k]; len += wlen[k] + 1; }
}

# === ACTIONS ===================================================

BEGIN {
  # Global state initialization.
  abort = 0;
  
  # Command-line parameters (see usage comment at top of file):
  if (maxLength == "") { maxLength = 0; }
  if (leftContext == "") { leftContext = -1; }
  if (rightContext == "") { rightContext = -1; }
  
  # Clears the word buffer:
  clear_words();
  
  # Count of words preceding the first one in the word buffer:
  nWords = 0;
  
  # Count of phrases written to stdout:
  nPhrases = 0;

  # Current page+unit, line, and transcriber code.
  # Fixed: the transcriber variable used to be initialized as
  # "cur_trans", but the per-line rule reads and writes "cur_trn";
  # that only worked because an unset awk variable compares equal
  # to "".  Initialize the variable the code actually uses:
  cur_unit = "";
  cur_line = "";
  cur_trn = "";
}

/^#/ { 
  # "#"-comment lines in the input carry no text; skip them.
  if (abort) { exit; }
  next;
}

/./ {
  # Main action: processes one nonblank input line.  Parses the
  # location code (if any), appends the line's words to the word
  # buffer, and flushes the buffer at unit or paragraph boundaries.
  if (abort) exit;
  # Try to parse a leading "<...>" locator (".evt" format):
  if (match($0, /^<[fc][0-9]+[vr][0-9]?[.]([A-Za-z][A-Za-z0-9]?[.]|[A-Z]|)[0-9]+[a-z]?([;][A-Z]|)>/)) 
    { loc = substr($0,2,RLENGTH-2);
      skip = 19;   # text proper starts at column 20 in ".evt" files
    }
  else if (substr($0,1,1) == "<") 
    { error("bad location code"); }
  else 
    { # Plain format: synthesize a locator from the record number.
      loc = ("f0.P." NR ";X"); 
      skip = 0;
    }
  
  # Line consists only of the locator — nothing to do:
  if (skip >= length($0)) next;
  
  # Analyze and regularize location code: 
  len = length(loc);
  if (substr(loc,len-1,1) != ";") 
    { # No ";T" suffix — default transcriber code:
      trans = "X"; 
      # error("semicolon?");
    }
  else
    { # Split the ";T" transcriber suffix off the locator:
      trans = substr(loc,len,1); 
      loc = substr(loc, 1, len-2);
    }

  # The locator must end with a line number (digits, opt. letter):
  if (! match(loc, /[0-9]+[a-z]?$/))
    { error("prog error: no line num"); }
  else
    { line = substr(loc, RSTART); 
      unit = substr(loc, 1, RSTART-1);
    }
  
  # Require a single transcriber code throughout the file.
  # NOTE(review): BEGIN initializes "cur_trans", not "cur_trn"; this
  # works only because an unset awk variable compares equal to "" —
  # confirm the intended variable name.
  if (trans == cur_trn)
    { }
  else if (cur_trn == "")
    { cur_trn = trans; }
  else
    { error("wrong transcriber code"); }
    
  # From here on "loc" is the two-field "LOC TRANS" string:
  loc = (loc " " trans);

  # Do not consider phrases that span more than one text unit:
  if (unit != cur_unit)
    { dump_phrases(); cur_unit = unit; }
  else if (line == cur_line)
    { error("repeated line"); }
  cur_line = line;
  
  # Get the text proper "txt", omitting skipped part, 
  # and replace comments and fillers by "!";
  txt = erg_erase_comments(substr($0,1+skip));
  
  # Replace four or more "*"s in the same word by "***":
  # (first shift all "!" fillers to the right of any "*" runs, then
  # repeatedly shave "****" down, leaving "!" fillers behind)
  while(gsub(/[!][*]/, "*!", txt) > 0) { }
  while(sub(/[*][*][*][*]/, "!***", txt) > 0) { }
  
  # Extract the non-empty words from "txt":
  split("", wp); split("", wl); split("", ws);
  nw = gather_words(txt, wp, wl, ws);
  
  # Append the words to the word buffer:
  # ("findex" tracks the end of the previous word, so the gap
  # iindex-findex is the filler run between consecutive words)
  findex = 1;
  for (i=1; i<=nw; i++)
    { wd = erg_pack(substr(txt, wp[i], wl[i]));
      if (wd != "")
        { iindex = wp[i];
          append_filler(iindex - findex);
          findex = wp[i] + wl[i];
          # print i, wp[i], wl[i], iindex, findex > "/dev/stderr";
          append_word(wd, ws[i], loc, iindex, findex);
        }
    }
  # Account for trailing fillers after the last word:
  append_filler(length(txt) - findex);

  # Phrases should not span paragraph boundaries:
  # (a "=" is only legal at end-of-line, modulo trailing fillers)
  if (match(txt, /[=]/))
    { if (! match(substr(txt, RSTART+1), /[!% ]*$/))
        { error("embedded paragraph"); }
      dump_phrases();
    }

  next;
}

END {
  if(abort) exit;
  # Flush any phrases still pending in the word buffer:
  dump_phrases();
  # Report the totals on stderr:
  printf "read  %6d words\n", nWords > "/dev/stderr";
  printf "wrote %6d phrases\n", nPhrases > "/dev/stderr";
}

function error(msg)
{ 
  # Reports "msg" on stderr, tagged with the current input line
  # number, sets the global abort flag, and exits (the END action
  # still runs, but returns immediately because of the flag).
  printf "line %d: %s\n", NR, msg > "/dev/stderr";
  abort = 1;
  exit;
}