#! /usr/bin/gawk -f
# Last edited on 2001-01-18 02:38:46 by stolfi

BEGIN {
  # `abort' is the pending exit status; -1 means no error has been flagged.
  abort = -1;
  usage = ( "count-ngrams -f parse_elem_list.gawk \\\n" \
    "  -v maxOrder=MAXORDER \\\n" \
    "  { -v elemList='EL1,EL2,...ELN' | -v elemTable=FILE } \\\n" \
    "  [ -v spaceMarker=STRING ] \\\n" \
    "  [ -v showBadWords=BOOL ] \\\n" \
    "  < INFILE > OUTFILE" \
  );

  # Assumes the input is a text in plain EVA, one word per line, with
  # capitalized ligatures and with the elements marked off by {}. The
  # program inserts a `spaceMarker' element in braces at every word
  # boundary (including the beginning and end of file), and then writes
  # out several records of the form
  # 
  #   COUNT NGRAM
  # 
  # where the NGRAM is a string of from 1 to MAXORDER consecutive elements
  # from the text.
  # 
  # One record is written for each NGRAM that does not overlap an
  # invalid word. A word is considered valid only if all its elements are in
  # the specified list.
  
  if (maxOrder == "") { arg_error("must specify \"maxOrder\""); }
  # The ring buffer is indexed modulo `maxOrder', so the value must be a
  # positive integer: zero would be a division by zero, and a negative or
  # fractional value would produce meaningless buffer indices.
  maxOrder += 0;
  if ((maxOrder < 1) || (maxOrder != int(maxOrder)))
    { arg_error("\"maxOrder\" must be a positive integer"); }
  if (spaceMarker == "") { spaceMarker = "_"; }
  spaceMarkerWBraces = ( "{" spaceMarker "}" );
  if (showBadWords == "") { showBadWords = 0; }
  
  # The set of valid elements comes either from the explicit list
  # `elemList' or from `elemTable' (handed to `load_elems_from_file',
  # which is defined outside this file -- presumably a file name).
  if ((elemList == "") == (elemTable == ""))
    { arg_error("must define exactly one of \"elemList\" and \"elemTable\""); }
  split("", elem);
  split("", elemIndex);
  split("", elemClass);
  if (elemList != "") 
    { nelems = parse_explicit_elems(elemList,elem,elemIndex,elemClass); }
  else
    { nelems = load_elems_from_file(elemTable,elem,elemIndex,elemClass); }

  # indexed with the capitalized and bracketed word ngram:
  split("", ngramCt);

  ntoken = 0; ntokenBad = 0;
  
  # The last valid characters that were processed are `buf[ibuf + k]'
  # for `k = 0..nbuf-1', where indices are taken modulo `maxOrder'
  # and `nbuf' is strictly less than `maxOrder'.
  split("", buf);

  # Account for the initial space (the word boundary at beginning of file):
  flush_buffer();
}

function flush_buffer()
{ 
  # Resets the ngram buffer so that it looks as if a word space was
  # the only thing seen so far.  Used at the start of the input and
  # after every invalid word.  Works on the globals `ibuf', `nbuf',
  # `buf' and `ngramCt'.
  ibuf = 0;
  nbuf = ((maxOrder > 1) ? 1 : 0);
  # With `maxOrder' of 1 no context is kept, so the buffer stays empty.
  if (nbuf == 1) { buf[0] = spaceMarkerWBraces; }
  # The pretended word space is tallied as a unigram in either case.
  ngramCt[spaceMarkerWBraces] += 1;
}

# Once an error status has been recorded, stop reading and exit with it.
abort >= 0 { exit abort }

# Skip lines that are empty or all spaces, and lines whose first
# non-space character is `#'.
$0 ~ /^ *([#]|$)/ { next }

/./ { 
  # Every remaining non-empty line must hold exactly one word.
  ntoken += 1;
  if (NF != 1) { data_error("bad line format"); }
  token = $1;
  if (token !~ /^[{}a-zA-Z0-9?]+$/) { data_error("bad word"); }

  # Put a blank at each `}{' junction so that split() yields the
  # individual braced elements in `tkel[1..ne]':
  tk = token;
  gsub(/[}][{]/, "} {", tk);
  ne = split(tk, tkel, " ");
  
  # The word-final space goes in as one extra element:
  tkel[ne+1] = spaceMarkerWBraces;
    
  # Validate every element.  The scan does not stop at the first
  # unknown element, so that a malformed one later in the word is
  # still reported as a data error.
  tokenIsBad = 0;
  i = 1;
  while (i <= ne)
    { e = tkel[i];
      if (e !~ /^[{][a-zA-Z0-9?]+[}]$/) { data_error(("badly formed elem \"" e "\"")); }
      # Strip the enclosing braces (the check above guarantees them):
      e = substr(e, 2, length(e) - 2);
      if (! (e in elemIndex)) { tokenIsBad = 1; }
      i++;
    }

  if (! tokenIsBad)
    { # Tally the ngrams that end in this word or in its final space:
      for (i = 1; i <= ne+1; i++) 
        { push_elem_and_tally_ngrams(tkel[i]); }
      next;
    }

  # Invalid word: optionally report it, then restart the ngram context.
  if (showBadWords) { printf "! %s\n", token > "/dev/stderr"; }
  flush_buffer();
  ntokenBad++;
  next;
}

function push_elem_and_tally_ngrams(ei,    j,g,r,er)
{
  # Adds the new element `ei' to the circular buffer and increments
  # `ngramCt' for every ngram that ends with it.
  #
  #   ei         the element to append (a string).
  #   j,g,r,er   local work variables; callers must not pass them.
  #
  # Uses the globals `buf', `ibuf', `nbuf' and `maxOrder'.  On entry
  # the buffer holds `nbuf' elements starting at index `ibuf' (modulo
  # `maxOrder'); on exit it again holds fewer than `maxOrder' elements.

  # Slot just past the newest element currently stored:
  j = (ibuf + nbuf) % maxOrder;
  buf[j] = ei; nbuf++;
  if (nbuf > maxOrder) { prog_error("bad nbuf"); }
  # Walk backwards from the new element, growing the ngram on the left
  # and counting each intermediate length (1, 2, ... nbuf elements):
  g = "";
  for (r = 0; r < nbuf; r++)
    { er = buf[(j + maxOrder - r) % maxOrder];
      g = (er g);
      # printf "%s\n", g > "/dev/stderr";
      ngramCt[g]++;
    }
  # Buffer is full: drop the oldest element to make room for the next push.
  if (nbuf == maxOrder) { ibuf = (ibuf + 1) % maxOrder; nbuf--; }
}

END {
  # An `exit' from an earlier rule or function still lands here, so
  # pass the recorded error status along and print nothing.
  if (abort >= 0) { exit abort; }

  # Summary statistics go to stderr, away from the data on stdout:
  printf("%d tokens read\n", ntoken) > "/dev/stderr";
  printf("%d bad tokens found\n", ntokenBad) > "/dev/stderr";

  # One `COUNT NGRAM' record per distinct ngram, in whatever order
  # the array traversal delivers them:
  for (g in ngramCt) { printf("%7d %s\n", ngramCt[g], g); }
}

function arg_error(msg)
{ 
  # Reports a command-line problem: writes `msg' and then the usage
  # text to stderr, records status 1 in `abort', and exits.
  printf("%s\n", msg) > "/dev/stderr";
  printf("usage: %s\n", usage) > "/dev/stderr";
  abort = 1;
  exit abort;
}

function data_error(msg)
{ 
  # Reports a problem in the input data: writes `msg' to stderr,
  # prefixed by the current line number of the input file, records
  # status 1 in `abort', and exits.
  printf("line %d: %s\n", FNR, msg) > "/dev/stderr";
  abort = 1;
  exit abort;
}

function prog_error(msg)
{ 
  # Reports a failed internal consistency check: writes `msg' to
  # stderr with a `***prog error' tag, records status 1 in `abort',
  # and exits.
  printf("***prog error: %s\n", msg) > "/dev/stderr";
  abort = 1;
  exit abort;
}