#! /usr/bin/gawk -f
# Last edited on 2001-12-29 01:29:49 by stolfi

BEGIN {
  # "abort" stays negative until one of the error routines sets an exit status.
  abort = -1;
  usage = ( "count-elem-pairs \\\n" \
    "  { -v rowList='a,o,...' | -v rowTable=FILE } \\\n" \
    "  { -v colList='a,o,...' | -v colTable=FILE } \\\n" \
    "  [ -v endMarker=STRING ] \\\n" \
    "  [ -v showBadWords=BOOL ] \\\n" \
    "  < INFILE.wct > OUTFILE.tex" \
  );

  # INPUT: records of the form
  #
  #   COUNT WORD
  #
  # WORD is plain EVA, with capitalized ligatures, and with every
  # element enclosed in {}; COUNT is the token count of that word.
  #
  # OUTPUT: the counts of adjacent element pairs in those tokens,
  # one pair per line, as
  #
  #   COUNT ELEM1:ELEM2
  #
  # ELEMENT LISTS: the valid elements for ELEM1 (rows) and ELEM2
  # (columns) are given either inline, through `rowList' and
  # `colList' (comma-separated), or through the files named by
  # `rowTable' and `colTable' (one element per line, in the first
  # field; "#" lines are excluded). Either way the elements must be
  # capitalized as in the input, and written without braces.
  #
  # The "endMarker" string (default "_") is implicitly prefixed and
  # postfixed to every word, and must appear in the element list.
  #
  # The special elements "/" "~" "+" and "-" are ignored; they may
  # appear multiple times in the element list.
  #
  # Only pairs whose elements are both listed are written out, in the
  # sequence given by the lists.

  # Defaults for the optional parameters:
  if (showBadWords == "") { showBadWords = 0; }
  if (endMarker == "") { endMarker = "_"; }

  # Per-element table, indexed with the capitalized element itself:
  split("", ect);

  # Per-pair table, indexed with the capitalized element pair:
  split("", eect);

  # Row elements: exactly one of the two sources must be given.
  if (((rowList != "") + (rowTable != "")) != 1)
    { arg_error("must define exactly one of \"rowList\" and \"rowTable\""); }
  split("", rowElem);
  split("", rowIndex);
  split("", rowClass);
  if (rowList == "")
    { nrowElems = load_elems_from_file(rowTable,rowElem,rowIndex,rowClass); }
  else
    { nrowElems = parse_explicit_elems(rowList,rowElem,rowIndex,rowClass); }

  # Column elements: same rule as for the rows.
  if (((colList != "") + (colTable != "")) != 1)
    { arg_error("must define exactly one of \"colList\" and \"colTable\""); }
  split("", colElem);
  split("", colIndex);
  split("", colClass);
  if (colList == "")
    { ncolElems = load_elems_from_file(colTable,colElem,colIndex,colClass); }
  else
    { ncolElems = parse_explicit_elems(colList,colElem,colIndex,colClass); }

  # Tallies filled in while reading the input:
  split("", pairCt);
  split("", rowCt);
  split("", colCt);

  # Token totals for listed ("Gud") and unlisted ("Bad") elements:
  nrowGud = 0; nrowBad = 0;
  ncolGud = 0; ncolBad = 0;
}

# An error was already flagged: stop reading and exit with that status.
(abort >= 0) {
  exit abort;
}

# Ignore "#" comment lines and lines that are empty or all spaces.
/^ *([#]|$)/ {
  next;
}

/./ {
  # A data record must be exactly "COUNT WORD".
  if (NF != 2) { data_error("bad line format"); }
  ct = $1;
  w = $2;
  if (w !~ /^[{}a-zA-Z?]+$/) { data_error(("bad word \"" w "\"")); }

  # Put a blank between adjacent "{...}" groups so that split()
  # yields one braced element per array entry.
  gsub(/[}][{]/, "} {", w);
  ne = split(w, welem, " ");

  # Tally each pair of consecutive elements; the word is implicitly
  # bracketed by the end marker on both sides.
  prev = endMarker;
  i = 1;
  while (i <= ne)
    { e = welem[i];
      if (e !~ /^[{][a-zA-Z?]+[}]$/) { data_error(("bad elem \"" e "\"")); }
      # The element is known to be "{...}" here: drop the two braces.
      e = substr(e, 2, length(e) - 2);
      tally_pair(prev, e, ct, $2);
      prev = e;
      i++;
    }
  tally_pair(prev, endMarker, ct, $2);
  next;
}

# Records {ct} occurrences of the element pair ({ei},{ej}).
#
#   ei     first element of the pair (tallied as a row element)
#   ej     second element of the pair (tallied as a column element)
#   ct     number of tokens to add
#   worig  the word the pair came from; used only in diagnostics
#
# Updates the global tallies rowCt[ei], colCt[ej] and pairCt[ei,ej],
# and adds {ct} to nrowGud/ncolGud when the element is listed in
# rowIndex/colIndex, or to nrowBad/ncolBad when it is not.  When
# showBadWords is true, an unlisted element that contains no "?" is
# also reported on stderr together with {worig}.
function tally_pair(ei,ej,ct,worig)
{
  # Row side of the pair:
  if (! (ei in rowCt)) { rowCt[ei] = 0; }
  rowCt[ei] += ct;
  if (ei in rowIndex)
    { nrowGud += ct; }
  else
    { nrowBad += ct;
      # Report the word passed by the caller, not whatever $2 happens to be.
      if (showBadWords && (ei !~ /[?]/))
        { printf " %5d %-5s %s\n", ct, (ei ":"), worig > "/dev/stderr"; }
    }

  # Column side of the pair:
  if (! (ej in colCt)) { colCt[ej] = 0; }
  colCt[ej] += ct;
  if (ej in colIndex)
    { ncolGud += ct; }
  else
    { ncolBad += ct;
      if (showBadWords && (ej !~ /[?]/))
        { printf " %5d %-5s %s\n", ct, (":" ej), worig > "/dev/stderr"; }
    }

  # The pair itself is counted whether or not both elements are listed.
  pairCt[ei,ej] += ct;
}

END {
  # Nothing is written after an error; just propagate the status.
  if (abort >= 0) { exit abort; }

  # Report on stderr any elements seen in first position that are
  # not in the row list, with their token counts.
  if (nrowBad > 0) {
    printf "extraneous row elems found:\n" > "/dev/stderr";
    for (ei in rowCt) {
      if (ei in rowIndex) { continue; }
      printf "  %-5s %7d\n", ei, rowCt[ei] > "/dev/stderr";
    }
  }

  # Same report for second-position elements missing from the column list.
  if (ncolBad > 0) {
    printf "extraneous col elems found:\n" > "/dev/stderr";
    for (ej in colCt) {
      if (ej in colIndex) { continue; }
      printf "  %-5s %7d\n", ej, colCt[ej] > "/dev/stderr";
    }
  }

  # Write the pair counts in list order, rows outermost.  The special
  # elements "-", "+", "~" and "/" are skipped on both axes.
  for (i = 1; i <= nrowElems; i++) {
    ei = rowElem[i];
    if (ei ~ /^[-+~\/]$/) { continue; }
    for (j = 1; j <= ncolElems; j++) {
      ej = colElem[j];
      if (ej ~ /^[-+~\/]$/) { continue; }
      ct = pairCt[ei,ej];
      printf "%7d %s:%s\n", ct, ei,ej;
    }
  }
}

# Reports a command-line parameter error: prints {msg} and the usage
# text on stderr, sets the abort status to 1, and exits.
function arg_error(msg)
{
  printf("%s\n", msg) > "/dev/stderr";
  printf("usage: %s\n", usage) > "/dev/stderr";
  abort = 1;
  exit 1;
}

# Reports an error in the input data: prints {msg} on stderr, tagged
# with the current record number FNR, sets the abort status to 1,
# and exits.
function data_error(msg)
{
  printf("line %d: %s\n", FNR, msg) > "/dev/stderr";
  abort = 1;
  exit 1;
}

# Reports an error in an element table: prints {msg} on stderr, sets
# the abort status to 1, and exits.  The message says "element table"
# rather than naming a parameter, since the table may have been given
# through either rowTable or colTable.
function table_error(msg)
{ 
  printf "error in element table: %s\n", msg > "/dev/stderr";
  abort = 1; exit 1;
}