#! /usr/bin/gawk -f
# Last edited on 2002-01-03 22:40:49 by stolfi

BEGIN {
  # "abort" stays negative until arg_error or data_error records a
  # failure status; the END rule checks it before printing the table.
  abort = -1;
  usage = ( ARGV[0] "\\\n" \
    "  { -v elemList='a,o,...' | -v elemTable=FILE } \\\n" \
    "  [ -v maxRepeat=NUM ] \\\n" \
    "  [ -v minFreq=FRQ ] \\\n" \
    "  [ -v freqDigits=NUM ] \\\n" \
    "  [ -v showCounts=BOOL ] \\\n" \
    "  [ -v showFreqs=BOOL ] \\\n" \
    "  < INFILE.frq > OUTFILE.tex" \
  );

  # Tabulates given counts and/or frequencies of repeated symbols
  # and formats the output as a LaTeX table.
  # Assumes the input records have fields
  #
  #   COUNT FREQ GLYPH{REPEAT}
  #
  # where GLYPH is an EVA string, already capitalized,
  # REPEAT is an integer, and COUNT and FREQ are the
  # statistics of maximal GLYPH^REPEAT in some sample.
  # The output has one row per glyph and, for each repeat value
  # from 1 to "maxRepeat", a count column and/or a frequency column
  # (as selected by "showCounts" and "showFreqs").
  #
  # The output entries correspond to the glyphs listed in the
  # "elemList" string or in the "elemTable" file. In these lists,
  # a "-" glyph inserts a horizontal line.
  # This special glyph may occur multiple times.

  # Defaults for options not given on the command line.
  if (maxRepeat == "") { maxRepeat = 4; }
  if (showCounts == "") { showCounts = 1; }
  if (showFreqs == "") { showFreqs = 1; }
  if (minFreq == "") { minFreq = 0.00005; }
  if (freqDigits == "") { freqDigits = 4; }

  # Force "maxRepeat" to be a number, so that every later comparison
  # against it is numeric even if the option value was not a clean number,
  # and refuse values that would produce a table without repeat columns.
  maxRepeat = maxRepeat + 0;
  if (maxRepeat < 1)
    { arg_error("\"maxRepeat\" must be a positive number"); }

  # With both kinds of column disabled the header would ask for
  # \multicolumn{0} and the tabular would have no data columns.
  if ((! showCounts) && (! showFreqs))
    { arg_error("at least one of \"showCounts\" and \"showFreqs\" must be true"); }

  # Exactly one source for the list of glyphs must be given.
  if ((elemList == "") == (elemTable == ""))
    { arg_error("must define exactly one of \"elemList\" and \"elemTable\""); }
  split("", elem);
  split("", eindex);
  split("", eclass);
  if (elemList != "") 
    { nelems = parse_explicit_elems(elemList,elem,eindex,eclass); }
  else
    { nelems = load_elems_from_file(elemTable,elem,eindex,eclass); }

  # Counts and frequencies, indexed with the capitalized element and
  # the repeat count; "maxrep" is indexed by the element alone:
  split("", ect);
  split("", efr);
  split("", maxrep);
}

# Stop processing input as soon as a failure status has been recorded.
abort >= 0 {
  exit abort
}

# Skip lines that are empty or contain only spaces, and lines whose
# first non-space character is "#".
/^ *([#]|$)/ {
  next
}

# Parses one data record of the form "COUNT FREQ GLYPH{REPEAT}",
# validates every field, and stores the count and frequency in
# ect[GLYPH,REPEAT] and efr[GLYPH,REPEAT].  Also keeps in maxrep[GLYPH]
# the largest REPEAT seen for each glyph.  Any malformed field aborts
# the run through data_error.
/./ { 
  if (NF != 3) { data_error("bad line format"); }
  ct = $1; fr = $2; er = $3;
  nread++;

  # Split the third field into the glyph and the braced repeat count.
  # After match(), RLENGTH covers the glyph plus the opening brace.
  if (! match(er, /^[A-Za-z?]+[{]/)) 
    { data_error(("bad elem/repeat format \"" er "\"")); }
  e = substr(er, 1, RLENGTH-1);
  rep = substr(er, RLENGTH);
  if (rep !~ /^[{][0-9]+[}]$/)
    { data_error(("bad repeat format \"" rep "\"")); }
  gsub(/[{}]/, "", rep);
  rep = rep + 0;
  if ((rep < 1) || (rep > maxRepeat)) 
    { data_error(("bad repeat value \"" rep "\"")); }
  if ((e,rep) in ect)
    { data_error(("repeated elem/repeat \"" er "\"")); }

  # Validate the statistics.  The frequency must contain a decimal point
  # and at least one digit, so a lone "." is rejected.
  if (ct !~ /^[0-9]+$/) { data_error(("bad count format \"" ct "\"")); }
  if (fr !~ /^([0-9]+[.][0-9]*|[.][0-9]+)$/) { data_error(("bad freq format \"" fr "\"")); }

  # Record the entry only once the whole line is known to be valid.
  if (! (e in maxrep)) { maxrep[e] = 0; }
  if (rep > maxrep[e]) { maxrep[e] = rep; }
  ect[e,rep] = ct;
  efr[e,rep] = fr;
  next;
}

END {
  # Print the table only when no error was recorded; otherwise
  # propagate the recorded failure status.
  if (abort < 0)
    { print_elem_freqs_table(); }
  else
    { exit abort; }
}

# Writes the complete LaTeX table to standard output: a creator comment,
# the tabular preamble and header, then one row per listed glyph, a
# closing rule and the end of the tabular.  A "-" entry in the glyph list
# produces a horizontal rule instead of a row.  Glyphs whose largest
# recorded repeat is not greater than 1 are left out.  A progress trace
# naming the printed glyphs goes to standard error.
function print_elem_freqs_table(   k,glyph)
{
  printf "%% Created by tex-format-elem-rep-freqs\n";
  output_table_preamble();
  output_table_header();
  printf "nelems = %d:", nelems > "/dev/stderr";
  for (k = 1; k <= nelems; k++)
    { glyph = elem[k];
      if (glyph == "-")
        { output_hline(); continue; }
      # Skip glyphs that never occur repeated.
      if (! (maxrep[glyph] > 1))
        { continue; }
      printf " %s", glyph > "/dev/stderr";
      output_elem_name(format_elem(glyph));
      output_elem_counts(glyph, ect, efr);
      printf " \\str\\\\\n";
    }
  printf "\n" > "/dev/stderr";
  output_hline();
  printf "\\end{tabular}%%\n"; 
}

# Opens the tabular environment.  The column specification has one
# centered column for the glyph, then for each repeat value from 1 to
# maxRepeat one right-aligned column per enabled statistic (count,
# frequency) followed by a vertical bar.  Ends with a horizontal rule.
function output_table_preamble(   col,rep)
{
  # Accumulate the column specification, then print it in one go.
  col = "|c|";
  for (rep = 1; rep <= maxRepeat; rep++)
    { col = col (showCounts ? "r" : "") (showFreqs ? "r" : "") "|"; }
  printf "\\begin{tabular}{%s}\n", col;
  output_hline();
}

# Prints the header row: a "glyph" heading followed by one heading per
# repeat value, each spanning as many columns as there are enabled
# statistics.  The row is closed with a horizontal rule.
function output_table_header(   nc,rep)
{
  # Number of columns under each repeat heading.
  nc = 0;
  if (showCounts) { nc++; }
  if (showFreqs) { nc++; }

  printf "  \\hd{glyph}\n";
  rep = 1;
  while (rep <= maxRepeat)
    { printf "    & \\multicolumn{%d}{r|}{\\hd{%d}} \n", nc, rep;
      rep++;
    }
  printf "  \\\\\n";
  output_hline();
}

# Prints an indented LaTeX \hline command on a line of its own.
function output_hline()
{
  printf "%s\n", "  \\hline";
}

# Starts a table row: prints two spaces of indentation, then the
# already formatted glyph name "xe" left-justified in a field of
# 10 characters, then one space.
function output_elem_name(xe)
{
  printf "  %-10s ", xe;
}

# Prints the table cells of glyph "e" for every repeat value from 1 to
# maxRepeat, taking the raw count from ect[e,rep] and the raw frequency
# from efr[e,rep] and formatting them with format_count and format_freq.
function output_elem_counts(e,ect,efr,  rep)
{
  for (rep = 1; rep <= maxRepeat; rep++)
    { output_entry_count(format_count(ect[e,rep]), format_freq(efr[e,rep])); }
}

# Prints the cells of one repeat value: the formatted count "xct" when
# showCounts is set and the formatted frequency "xfr" when showFreqs is
# set.  Each cell is preceded by "& " and right-justified in a field of
# 10 characters.
function output_entry_count(xct,xfr)
{
  if (showCounts && showFreqs)
    { printf "& %10s & %10s ", xct, xfr; }
  else if (showCounts)
    { printf "& %10s ", xct; }
  else if (showFreqs)
    { printf "& %10s ", xfr; }
}

# Returns the LaTeX markup for glyph "e": the macro \tot for the
# special name "+", otherwise the glyph wrapped in \ev{...}.
function format_elem(e)
{
  return ((e == "+") ? "\\tot" : ("\\ev{" e "}"));
}

# Returns the LaTeX markup for count "ct": the macro \zeroct when the
# value is numerically zero (which includes a missing, empty value),
# otherwise the count printed as an integer inside \ct{...}.
function format_count(ct)
{
  if (ct + 0 != 0) 
    { return ("\\ct{" sprintf("%d", ct) "}"); }
  return "\\zeroct";
}

# Returns the LaTeX markup for frequency "fr": the macro \zerofr when
# the value is below minFreq, otherwise freqDigits+1 characters of the
# value printed with freqDigits decimals, wrapped in \fr{...}.
# A result below 1 loses its leading zero (".1234"); otherwise the
# leading digit is kept and the text is cut after freqDigits+1
# characters ("1.000").
function format_freq(fr)
{ 
  if (fr + 0 < minFreq) 
    { return "\\zerofr"; }

  # From here on "fr" holds the formatted text, not the number.
  fr = sprintf("%*.*f", freqDigits+2, freqDigits, fr);

  # NOTE: "fr" is now a string, so awk compares it with the constant as
  # text; the test decides where the kept characters start.
  return ("\\fr{" substr(fr, ((fr >= 1.0) ? 1 : 2), freqDigits+1) "}");
}

# Reports a command-line error: prints "msg" and the usage text to
# standard error, records failure status 1 in "abort", and exits.
function arg_error(msg)
{ 
  printf "%s\nusage: %s\n", msg, usage > "/dev/stderr";
  abort = 1;
  exit abort;
}

# Reports an input-data error: prints "msg" to standard error, prefixed
# with the current record number within the input file, records failure
# status 1 in "abort", and exits.
function data_error(msg)
{ 
  abort = 1;
  printf "line %d: %s\n", FNR, msg > "/dev/stderr";
  exit abort;
}