#! /usr/bin/gawk -f
# Last edited on 2001-12-28 21:02:27 by stolfi

BEGIN {
  # Exit status requested by an error handler; a negative value means
  # that no error has been raised so far.
  abort = -1;

  # Command-line synopsis, printed by arg_error().
  usage = "count_elems.gawk" " \\\n";
  usage = usage "  { -v elemList='a,o,...' | -v elemTable=FILE } \\\n";
  usage = usage "  [ -v showBadWords=BOOL ] \\\n";
  usage = usage "  [ -v joinRepeats=BOOL ] \\\n";
  usage = usage "  < INFILE.wct > OUTFILE.tex";

  # INPUT
  #
  #   Each input record must have the two fields
  #
  #     COUNT WORD
  #
  #   WORD is plain EVA, with capitalized ligatures and with every
  #   element delimited by {}; COUNT is the token count of WORD.
  #
  # OUTPUT
  #
  #   When "joinRepeats" is false, the output is a set of lines
  #
  #     TOTCOUNT ELEM
  #
  #   ELEM is an element (braces removed) and TOTCOUNT is the total
  #   number of occurrences of ELEM over all input words, each word
  #   weighted by its COUNT.
  #
  #   When "joinRepeats" is true, each maximal run of a repeated
  #   element is counted as a distinct element. For example,
  #   "{a}{b}{b}{a}{c}{c}{c}" contains 4 maximal runs: "{a}",
  #   "{b}{b}", "{a}", and "{c}{c}{c}". In this mode the run length,
  #   enclosed in braces, is appended to the ELEM field.
  #
  # ELEMENT LIST
  #
  #   The elements are given either directly through the `elemList'
  #   parameter (separated by commas) or through a file named by the
  #   `elemTable' parameter (one element per line, as the first field;
  #   "#" lines are excluded). Either way, the elements must be
  #   capitalized as in the input and written without braces.
  #
  #   The special elements "~", "/", "+", "-" may appear any number
  #   of times in the list, and are ignored.
  #
  #   Only the listed elements are written to the output, in the
  #   order in which they were listed.

  # Both boolean options default to false when not supplied.
  if (joinRepeats == "") { joinRepeats = 0; }
  if (showBadWords == "") { showBadWords = 0; }

  # Exactly one of the two element sources must be supplied.
  if ((elemList != "") == (elemTable != "")) {
    arg_error("must define exactly one of \"elemList\" and \"elemTable\"");
  }

  # Element tables, filled in by the loader below. The rest of this
  # file reads "elem" by position (1..nelems) and "eindex" by element
  # name; "eclass" is only handed to the loader here.
  split("", eclass);
  split("", eindex);
  split("", elem);
  nelems = (elemList != "") \
    ? parse_explicit_elems(elemList,elem,eindex,eclass) \
    : load_elems_from_file(elemTable,elem,eindex,eclass);

  # Occurrence totals, indexed with the capitalized element itself
  # (plus the repeat count when "joinRepeats" is on).
  split("", ect);
  # Longest run seen for each element; needed only in "joinRepeats" mode.
  if (joinRepeats) { split("", maxrep); }

  # Weighted tallies of listed ("good") and unlisted ("bad") elements.
  nbad = 0;
  ngud = 0;
}

# Stop reading input as soon as an error handler has set an exit status.
abort >= 0 { exit abort }

# Ignore blank lines and "#" comment lines (leading spaces allowed).
/^ *([#]|$)/ { next }

/./ {
  # A data record consists of exactly two fields: COUNT and WORD.
  if (NF != 2) { data_error("bad line format"); }
  ct = $1;
  w = $2;
  if (w !~ /^[{}a-zA-Z?]+$/) { data_error("bad word"); }

  # Insert a blank between adjacent "{...}" groups, so that split()
  # delivers one braced element per array slot.
  gsub(/[}][{]/, "} {", w);
  ne = split(w, welem, " ");

  rep = 0;  # length of the run of identical elements being scanned
  i = 1;
  while (i <= ne) {
    e = welem[i];
    if (e !~ /^[{][a-zA-Z?]+[}]$/) { data_error(("bad elem \"" e "\"")); }
    gsub(/[{}]/, "", e);

    # Tally the element as listed or unlisted, weighted by the word count.
    if (! (e in eindex)) {
      nbad += ct;
      # Unlisted elements containing "?" are not reported here.
      if (showBadWords && (e !~ /[?]/)) {
        printf " %5d %-5s %s\n", ct, e, $2 > "/dev/stderr";
      }
    } else {
      ngud += ct;
    }

    if (! joinRepeats) {
      # Plain mode: every occurrence counts under the bare element.
      ect[e] += ct;
    } else {
      # Run mode: extend the current run, and book it under "ELEM{LEN}"
      # once it ends (last element, or the next element differs).
      if (! (e in maxrep)) { maxrep[e] = 0; }
      rep++;
      if ((i == ne) || (welem[i+1] != welem[i])) {
        er = ( e "{" rep "}" );
        ect[er] += ct;
        if (rep > maxrep[e]) { maxrep[e] = rep; }
        rep = 0;
      }
    }
    i++;
  }
  next;
}

END {
  # If an error handler requested termination, propagate its status
  # and print nothing.
  if (abort >= 0) { exit abort; }

  # Diagnostics go to stderr so that they do not mix with the table.
  printf "%d valid elems found.\n", ngud > "/dev/stderr";
  if (nbad > 0)
    { printf "extraneous elems found:\n" > "/dev/stderr";
      for (e in ect)
        { # In "joinRepeats" mode the keys of "ect" have the form
          # "ELEM{N}", whereas "eindex" is keyed by the bare element.
          # Strip the repeat suffix before the membership test, so that
          # listed elements are not reported as extraneous.
          ebase = e;
          if (joinRepeats) { sub(/[{][0-9]+[}]$/, "", ebase); }
          if (! (ebase in eindex))
            { printf "  %-5s %5d\n", e, ect[e] > "/dev/stderr"; }
        } 
    }

  # Write the counts for the listed elements, in list order.
  for (i = 1; i <= nelems; i++)
    { 
      e = elem[i]; 
      # The special one-character elements "-", "~", "+", "/" are skipped.
      if (e !~ /^[-~+\/]$/) 
        { 
          if (joinRepeats)
            { # One line per run length, from 1 up to the longest run seen
              # for this element; lengths never seen print as 0.
              for (rep = 1; rep <= maxrep[e]; rep++)
                { er = ( e "{" rep "}" );
                  ct = ect[er];
                  printf "%7d %s\n", ct, er;
                }
            }
          else
            { # An element that never occurred prints with count 0.
              ct = ect[e];
              printf "%7d %s\n", ct, e;
            }
        }
    }
}

# Reports a command-line argument problem: writes "msg" and the usage
# synopsis to stderr, records exit status 1 in "abort", and exits.
function arg_error(msg)
{
  printf "%s\nusage: %s\n", msg, usage > "/dev/stderr";
  abort = 1;
  exit 1;
}

# Reports a malformed input record: writes "msg", prefixed with the
# current record number (FNR), to stderr, records exit status 1 in
# "abort", and exits.
function data_error(msg)
{
  abort = 1;
  printf "line %d: %s\n", FNR, msg > "/dev/stderr";
  exit 1;
}

# Reports a problem with the element table: writes "msg" to stderr,
# records exit status 1 in "abort", and exits.
# The message names the option as "elemTable", which is how it is
# spelled in the usage synopsis.
function table_error(msg)
{ 
  printf "error in elemTable: %s\n", msg > "/dev/stderr";
  abort = 1; exit 1;
}