#! /usr/bin/gawk -f
# Last edited on 2002-01-03 22:22:38 by stolfi

BEGIN {
  # A negative value means no error has been reported so far; the error
  # handlers overwrite it with the exit status.
  abort = -1;
  usage = ( "tex-format-elem-tw-freqs \\\n" \
    "  { -v elemList='a,o,...' | -v elemTable=FILE } \\\n" \
    "  [ -v ncols=NUM ] \\\n" \
    "  [ -v minFreq=FRQ ] \\\n" \
    "  [ -v freqDigits=NUM ] \\\n" \
    "  [ -v showClasses=BOOL ] \\\n" \
    "  [ -v showCounts=BOOL ] \\\n" \
    "  [ -v showFreqs=BOOL ] \\\n" \
    "  < INFILE.jfr > OUTFILE.tex" \
  );

  # Purpose: tabulate counts and/or frequencies of symbols and write
  # them out as a LaTeX table.
  #
  # Each input record is expected to carry five fields:
  #
  #   TCOUNT TFREQ WCOUNT WFREQ GLYPH
  #
  # GLYPH is an EVA string, already capitalized.  TCOUNT/TFREQ are its
  # statistics among the tokens, WCOUNT/WFREQ its statistics among the
  # words.  The table has `ncols' entries per row and is filled
  # row by row.
  #
  # Which glyphs appear, and in what order, is given by the "elemList"
  # string or the "elemTable" file.  Three pseudo-glyphs are recognized
  # there, each of which may be used any number of times:
  #   "~"  leaves one entry blank;
  #   "/"  pads the rest of the current row with blank entries;
  #   "-"  pads like "/" and then requests a horizontal line.
  #
  # When showClasses is true, the class of the elements is also printed
  # in the leftmost column.  Per the original notes, this is effective
  # only when the elements come from a file.

  # Fill in defaults for every option that was not given with -v.
  if (freqDigits == "")  { freqDigits = 4; }
  if (minFreq == "")     { minFreq = 0.00005; }
  if (showClasses == "") { showClasses = ( elemTable != ""); }
  if (showFreqs == "")   { showFreqs = 1; }
  if (showCounts == "")  { showCounts = 1; }
  if (ncols == "")       { ncols = 2; }

  # Exactly one of the two element sources must be supplied.
  if ((elemList != "") == (elemTable != ""))
    { arg_error("must define exactly one of \"elemList\" and \"elemTable\""); }

  # Start from empty element tables, then fill them from the chosen source.
  split("", elem); split("", eindex); split("", eclass);
  if (elemList == "")
    { nelems = load_elems_from_file(elemTable,elem,eindex,eclass); }
  else
    { nelems = parse_explicit_elems(elemList,elem,eindex,eclass); }

  # `hasclass' is not assigned in this block; presumably the element
  # loaders set it -- TODO confirm against their definitions.
  if (showClasses && !hasclass)
    { arg_error("there are no classes to show"); }

  # Per-element statistics for tokens (et*) and words (ew*), keyed by
  # the capitalized element itself.
  split("", etCt); split("", etFr);
  split("", ewCt); split("", ewFr);
}

# Once an error status has been recorded, stop processing input.
abort >= 0 { exit abort }

# Skip blank lines and `#' comment lines (optionally preceded by spaces).
/^ *([#]|$)/ { next }

# Data record: TCOUNT TFREQ WCOUNT WFREQ GLYPH.
# Stores the four statistics in etCt/etFr/ewCt/ewFr under the glyph.
# Malformed lines, invalid glyphs and repeated glyphs are reported
# through data_error, which terminates the run.
/./ { 
  if (NF != 5) { data_error("bad line format"); }
  tCt = $1; tFr = $2; wCt = $3; wFr = $4; e = $5;
  nread++;
  # A glyph is a run of letters and "?", or the single character "+".
  # The "+" form is accepted because format_elem gives it a dedicated
  # rendering (\tot); without it that entry could never receive data.
  if (e !~ /^([A-Za-z?]+|[+])$/) { data_error(("bad elem \"" e "\"")); }
  if (e in etCt) { data_error(("repeated elem \"" e "\"")); }
  etCt[e] = tCt; etFr[e] = tFr;
  ewCt[e] = wCt; ewFr[e] = wFr;
  next;
}

END {
  # Write the table only if no error was recorded; otherwise leave
  # with the stored status.
  if (abort < 0)
    { print_elem_freqs_table(); }
  else
    { exit abort; }
}

# Writes the whole LaTeX table to standard output: a "% Created by"
# comment, the tabular preamble, the header row, one entry per item of
# elem[1..nelems] laid out `ncols' entries per row, and the closing
# \end{tabular}.  A progress trace listing the elements goes to
# /dev/stderr.  All parameters are local work variables; callers pass
# no arguments.  (`oldrow' is declared but never used.)
function print_elem_freqs_table(  \
    i,col,row,oldrow,hline,cline,e,tCt,tFr,wCt,wFr,\
    cl,oldcl,xe,xtCt,xtFr,xwCt,xwFr,xcl \
)
{
  printf "%% Created by %s\n", ARGV[0];
  
  # Table preamble
  output_table_preamble();

  row = 0;

  # Column headers
  # With row == 0 this call only emits the \hline above the header.
  end_row(row, 1, 0);
  output_table_header();
  row++;
  
  # Table entries:
  # `col' is the position the next entry will occupy in the current row.
  # Starting it past the last column makes the first entry close the
  # header row; hline = 1 puts a full rule under that header.
  oldcl = "";
  col = ncols+1;
  hline = 1;
  cline = 0;

  printf "nelems = %d:", nelems > "/dev/stderr";
  # Keeps going past nelems, feeding blank "~" entries, until the last
  # row has been filled.
  for (i = 1; ((i <= nelems) || (col <= ncols)); i++)
    { # Assert: col > 1.
      e = (i <= nelems ? elem[i] : "~");
      printf " %s", e > "/dev/stderr";
      # Obtain element data:
      if (e ~ /^[-\/]$/)
        { # "/" or "-": fill the remainder of the current row with blanks.
          while (col <= ncols) { output_entry(col, "", "", "", "", ""); col++; }
          # "-" additionally asks end_row for a \cline at the next row
          # break, passing ncols as its extent.
          if (e == "-") { cline = ncols; }
        }
      else 
        { if (e == "~")
            { # Blank entry: every formatted field is empty and the
              # class is carried over unchanged.
              cl = oldcl; 
              tCt = 0; tFr = 0; wCt = 0; wFr = 0;
              xe = ""; xcl = "";
              xtCt = ""; xtFr = ""; xwCt = ""; xwFr = "";
            }
          else
            { # Elements absent from the input yield empty strings here.
              cl = eclass[i]; 
              tCt = etCt[e]; tFr = etFr[e]; 
              wCt = ewCt[e]; wFr = ewFr[e]; 
              # Format values
              if (showClasses && (cl != oldcl))
                { # Class change: label the new class, finish the current
                  # row with blanks and force a full rule before the next.
                  xcl = ("\\cl{" cl "}");
                  while (col <= ncols) { output_entry(col, "", "", "", "", ""); col++; }
                  hline = 1; cline = 0;
                }
              else
                { xcl = ""; }
              xe = format_elem(e);
              xtCt = format_count(tCt); xwCt = format_count(wCt); 
              xtFr = format_freq(tFr);  xwFr = format_freq(wFr); 
            }
          
          # Print element entry
          if (col > ncols)
            { # The row is full: terminate it with the pending rule (if
              # any) and begin a new one.
              end_row(row, hline, cline);
              row++; col = 1; hline = 0; cline = 0;
            }
          # Assert: col <= ncols
          output_entry(col, xe, xtCt, xtFr, xwCt, xwFr, xcl); col++;
          oldcl = cl;
        }
    }
  printf "\n" > "/dev/stderr";
  # Close the last row with a full rule and end the tabular.
  end_row(row, 1, 0);
  printf "\\end{tabular}%%\n"; 
}

# Prints the \begin{tabular}{...} line.  The column spec holds an
# optional class column followed, for each of the `ncols' entries, by a
# glyph column and (when any statistic is shown) two groups of
# right-aligned columns, one per shown statistic.
function output_table_preamble(   k,grp,spec)
{
  # One "r" for every statistic that is displayed.
  grp = (showCounts ? "r" : "") (showFreqs ? "r" : "");

  # Column spec of a single entry: the glyph, then the two groups.
  spec = "|c|";
  if (grp != "") { spec = (spec grp "|" grp "|"); }

  printf "\\begin{tabular}{";
  if (showClasses) { printf "|c"; }
  for (k = 1; k <= ncols; k++) { printf "%s", spec; }
  printf "}\n";
}

# Prints the header cells: an optional "class" heading, then for each of
# the `ncols' entries a "glyph" heading and, when statistics are shown,
# "text" and "lexicon" headings each spanning the statistic columns.
# The row terminator is not printed here.
function output_table_header(   k,nstat,bar)
{
  # Number of statistic columns in each group.
  nstat = (showCounts ? 1 : 0) + (showFreqs ? 1 : 0);

  if (showClasses) { printf "\\hd{class} &\n  "; }

  for (k = 1; k <= ncols; k++)
    { 
      if (k > 1) { printf "&\n"; }
      printf "  \\hd{glyph} ";
      if (nstat <= 0) { continue; }
      # Every entry but the last gets an extra "|" after "lexicon".
      bar = ( k < ncols ? "|" : "");
      printf "& \\multicolumn{%d}{c|}{\\hd{text}}", nstat;
      printf "& \\multicolumn{%d}{c|%s}{\\hd{lexicon}}", nstat, bar;
    }
}

# Returns the LaTeX markup for glyph `e': "\tot" for the special glyph
# "+" (presumably the totals entry), "\ev{e}" for anything else.
function format_elem(e)
{
  return ((e == "+") ? "\\tot" : ("\\ev{" e "}"));
}

# Returns the LaTeX markup for count `ct': "\zeroct" when it is
# numerically zero (an empty string counts as zero), otherwise
# "\ct{N}" with N printed as an integer.
function format_count(ct)
{
  if (ct + 0 != 0)
    { return ("\\ct{" sprintf("%d", ct) "}"); }
  return "\\zeroct";
}

# Returns the LaTeX markup for frequency `fr'.
#   fr   frequency value; an empty string is treated as 0.
# Uses the globals `minFreq' (values below it give "\zerofr") and
# `freqDigits' (number of decimals printed).  The result "\fr{...}"
# holds freqDigits+1 characters: values that round below 1 lose their
# leading "0" (".1234"), larger ones lose trailing digits ("1.000").
function format_freq(fr,   txt)
{ 
  if (fr + 0 < minFreq) 
    { return "\\zerofr"; }

  txt = sprintf("%*.*f", freqDigits+2, freqDigits, fr);
  # Test the ROUNDED value, and force a numeric comparison: `txt' is a
  # string, so a bare `txt >= 1.0' would be compared as text.
  if (txt + 0 >= 1.0) 
    { txt = substr(txt, 1, freqDigits+1); }
  else
    { txt = substr(txt, 2, freqDigits+1); }
  return ("\\fr{" txt "}");
}

# Terminates the current table row and prints the rule that follows it.
#   row    index of the row being closed; for row 0 there is nothing to
#          terminate, so only the rule is printed.
#   hline  if true, print a full-width \hline.
#   cline  otherwise, if positive, print a \cline spanning that many
#          table ENTRIES, starting after the class column (if shown).
# An entry occupies one glyph column plus two groups of statistic
# columns, so the entry count is converted to LaTeX columns before
# being handed to \cline.
function end_row(row,hline,cline,   fcol,ncell)
{
  if (row > 0) { printf "\\str\\\\\n"; }
  if (hline) 
    { printf "  \\hline\n"; }
  else if (cline > 0)
    { # LaTeX columns per entry: glyph + (text stats) + (lexicon stats).
      ncell = 1 + 2*((showCounts ? 1 : 0) + (showFreqs ? 1 : 0));
      fcol = 1 + (showClasses ? 1 : 0 );
      printf "  \\cline{%d-%d}\n", fcol, fcol + cline*ncell - 1;
    }
}

# Prints one table entry at position `col' of the current row.
#   xe                   formatted glyph
#   xtCt, xtFr           formatted token count and frequency
#   xwCt, xwFr           formatted word count and frequency
#   xcl                  formatted class label; printed only in front of
#                        the first entry of a row, when showClasses is on
# Entries after the first are preceded by a "&" column separator.
function output_entry(col,xe,xtCt,xtFr,xwCt,xwFr,xcl,   cells)
{
  if (col == 1)
    { printf "  ";
      if (showClasses) { printf "%10s &\n  ", xcl; }
    }
  else
    { printf "&\n  "; }

  # Assemble the glyph cell and whichever statistic cells are enabled.
  cells = sprintf("%-10s ", xe);
  if (showCounts) { cells = (cells sprintf("& %10s ", xtCt)); }
  if (showFreqs)  { cells = (cells sprintf("& %10s ", xtFr)); }
  if (showCounts) { cells = (cells sprintf("& %10s ", xwCt)); }
  if (showFreqs)  { cells = (cells sprintf("& %10s ", xwFr)); }
  printf "%s", cells;
}

# Reports a command-line error: writes `msg' and the usage text to
# /dev/stderr, records status 1 in `abort', and exits with status 1.
function arg_error(msg)
{ 
  printf "%s\nusage: %s\n", msg, usage > "/dev/stderr";
  abort = 1;
  exit 1;
}

# Reports an input-data error: writes `msg' to /dev/stderr prefixed by
# the current record number (FNR), records status 1 in `abort', and
# exits with status 1.
function data_error(msg)
{ 
  printf("line %d: %s\n", FNR, msg) > "/dev/stderr";
  abort = 1;
  exit 1;
}