#! /usr/bin/gawk -f
# Last edited on 2002-01-03 22:25:32 by stolfi

BEGIN {
  # "abort" stays negative until one of the error handlers stores an
  # exit status in it.
  abort = -1;
  usage = ( "tex-format-elem-pair-freqs \\\n" \
    "  { -v rowList='a,o,...' | -v rowTable=FILE } \\\n" \
    "  { -v colList='a,o,...' | -v colTable=FILE } \\\n" \
    "  [ -v elemPrefix=STRING ] \\\n" \
    "  [ -v endMarker=STRING ] \\\n" \
    "  [ -v minFreq=FRQ ] \\\n" \
    "  [ -v freqDigits=NUM ] \\\n" \
    "  [ -v showCounts=BOOL ] \\\n" \
    "  [ -v showRowFreqs=BOOL ] \\\n" \
    "  [ -v showColFreqs=BOOL ] \\\n" \
    "  < INFILE.gpf > OUTFILE.tex" \
  );

  # Builds a LaTeX table out of precomputed counts and/or frequencies
  # of element pairs.  Each input record is expected to look like
  #
  #   COUNT ROWFREQ COLFREQ TOTFREQ ELEM1:ELEM2
  #
  # with ELEM1 and ELEM2 being EVA strings, already capitalized.
  # Either element (or both) may be "+", which denotes a row total,
  # a column total, or the table total.
  #
  # The rows and columns of the output are the elements given in the
  # "rowList" and "colList" strings, or in the "rowTable" and
  # "colTable" files.
  #
  # Within those element lists, "/" and "~" are ignored, and "-" asks
  # for a rule at that row or column position.  The "endMarker"
  # string is rendered as \remk or \cemk and "+" as \rtot or \ctot.
  # Any other element becomes \rev{ELEM1} or \cev{ELEM2}, with the
  # optional "elemPrefix" placed in front of ELEM1 or ELEM2.
  # A frequency below "minFreq" is rendered as \zerofr, any other
  # one as \fr{FREQ}.
  #

  # Defaults for options that were not given (or given as empty):
  if (freqDigits == "") { freqDigits = 2; }
  if (minFreq == "") { minFreq = 0.005; }
  if (showColFreqs == "") { showColFreqs = 1; }
  if (showRowFreqs == "") { showRowFreqs = 1; }
  if (showCounts == "") { showCounts = 1; }
  if (endMarker == "") { endMarker = "_"; }

  # Row elements: the inline list and the table file are mutually
  # exclusive, and one of them is mandatory.
  if ((rowList != "") == (rowTable != ""))
    { arg_error("must define exactly one of \"rowList\" and \"rowTable\""); }
  # Start from empty arrays; they are filled by the loader helpers,
  # which are defined outside this section.
  split("", rowElem);
  split("", rowIndex);
  split("", rowClass);
  if (rowList == "")
    { nrowElems = load_elems_from_file(rowTable,rowElem,rowIndex,rowClass); }
  else
    { nrowElems = parse_explicit_elems(rowList,rowElem,rowIndex,rowClass); }

  # Column elements: same rules as for the rows.
  if ((colList != "") == (colTable != ""))
    { arg_error("must define exactly one of \"colList\" and \"colTable\""); }
  split("", colElem);
  split("", colIndex);
  split("", colClass);
  if (colList == "")
    { ncolElems = load_elems_from_file(colTable,colElem,colIndex,colClass); }
  else
    { ncolElems = parse_explicit_elems(colList,colElem,colIndex,colClass); }

  # Per-pair data read from the input, keyed by [ELEM1,ELEM2]:
  split("", pairCt);
  split("", rowFreq);
  split("", colFreq);
  npairs = 0;
}

# Once an error handler has stored an exit status in "abort",
# stop processing input and leave with that status.
(abort >= 0) { exit abort; }

# Skip blank lines and "#" comment lines.  Leading tabs are accepted
# as well as leading spaces, since awk's default field splitting treats
# both as whitespace; otherwise a tab-only line or a tab-indented comment
# would reach the data rule and be rejected as a malformed record.
/^[ \t]*([#]|$)/ { next; }

/./ {
  # A data record carries exactly five whitespace-separated fields:
  # COUNT ROWFREQ COLFREQ TOTFREQ ELEM1:ELEM2
  if (NF != 5) { data_error("bad line format"); }
  ct = $1;
  rfr = $2;
  cfr = $3;
  totfr = $4;
  pr = $5;
  npairs++;

  # The last field must split into exactly two elements at the ":".
  nitems = split(pr, item, ":");
  if (nitems != 2) { data_error("bad item format"); }

  # Each element is either the total marker "+" or a non-empty run
  # of letters, "_" and "?".
  k = 1;
  while (k <= 2)
    { it = item[k];
      if (! ((it ~ /^[+]$/) || (it ~ /^[_A-Za-z?]+$/)))
        { data_error("bad elem"); }
      k++;
    }

  # Store the values under the [ELEM1,ELEM2] key; TOTFREQ is read
  # but not stored.
  ei = item[1];
  ej = item[2];
  pairCt[ei,ej] = ct;
  rowFreq[ei,ej] = rfr;
  colFreq[ei,ej] = cfr;
  next;
}

END {
  # Emit the table only when no error was recorded; otherwise leave
  # with the status stored by the error handler.
  if (abort < 0)
    { print_elem_pair_freqs_table(); }
  else
    { exit abort; }
}

function print_elem_pair_freqs_table(  \
  spec,c,r,rowKey,colKey,n,rf,cf,rowLabel,unused,nTxt,rfTxt,cfTxt \
)
{
  # Writes the complete LaTeX "tabular" environment to standard output:
  # a preamble, the header row (via print_table_header), one line group
  # per visible row element, and the closing rule.
  # Reads the globals rowElem/nrowElems, colElem/ncolElems, pairCt,
  # rowFreq, colFreq and the showCounts/showRowFreqs/showColFreqs flags.
  # All parameters are local scratch variables.

  printf "%% Created by tex-format-elem-pair-freqs\n";

  # Each visible column element contributes one right-aligned LaTeX
  # column per enabled quantity.
  spec = "";
  if (showCounts) { spec = (spec "r"); }
  if (showRowFreqs) { spec = (spec "r"); }
  if (showColFreqs) { spec = (spec "r"); }

  # Table preamble: "-" becomes a vertical rule, "~" and "/" are skipped.
  printf "\\begin{tabular}{|c|";
  for (c = 1; c <= ncolElems; c++)
    { colKey = colElem[c];
      if (colKey ~ /^[~\/]$/) { continue; }
      printf "%s", (colKey == "-" ? "|" : spec);
    }
  printf "|}\n";
  printf "  \\hline\n";

  print_table_header();

  # Table body:
  for (r = 1; r <= nrowElems; r++)
    { rowKey = rowElem[r];
      # "-" in the row list is a horizontal rule.
      if (rowKey == "-") { printf "  \\hline\n"; continue; }
      # "~" and "/" produce nothing.
      if (rowKey ~ /^[~\/]$/) { continue; }

      rowLabel = format_elem(rowKey,"r");
      printf "  %-10s\n    ", rowLabel;

      for (c = 1; c <= ncolElems; c++)
        { colKey = colElem[c];
          # Only real elements (and "+") produce cells.
          if (colKey ~ /^[-~\/]$/) { continue; }
          n = pairCt[rowKey,colKey];
          rf = rowFreq[rowKey,colKey];
          cf = colFreq[rowKey,colKey];
          nTxt = format_count(n);
          rfTxt = format_freq(rf);
          cfTxt = format_freq(cf);
          if (showCounts)   { printf "& %10s ", nTxt; }
          if (showRowFreqs) { printf "& %10s ", rfTxt; }
          if (showColFreqs) { printf "& %10s ", cfTxt; }
          printf "\n    ";
        }
      printf "\\rstr\n    \\\\\n";
    }

  printf "  \\hline\n";
  printf "\\end{tabular}%%\n";
}

function print_table_header(   col,ej,xej,nspan,lbar,algn,rbar,last,tail)
{
  # Writes the header row of the table to standard output: an empty
  # ("~") corner cell followed by one \multicolumn cell per visible
  # column element, then "\cstr", the row terminator and an \hline.
  # Reads the globals colElem/ncolElems and the showCounts/showRowFreqs/
  # showColFreqs flags.  All parameters are local scratch variables.

  printf "  %-10s", "~";

  # Each header cell spans one LaTeX column per enabled quantity.
  nspan = 0;
  if (showCounts) { nspan++; }
  if (showRowFreqs) { nspan++; }
  if (showColFreqs) { nspan++; }
  # Center the header if the column has a single frequency value:
  if (showCounts || (nspan > 1)) { algn = "r"; } else { algn = "c"; }

  # Find the last visible column.  The tabular preamble closes with one
  # "|" plus one "|" for every "-" that follows the last visible column,
  # so that cell must carry the same bars; checking only for
  # col == ncolElems would drop them whenever the list ends in "-", "~"
  # or "/".  If no column is visible, "last" ends up as 0.
  tail = "|";
  for (last = ncolElems; last >= 1; last--)
    { ej = colElem[last];
      if (ej == "-")
        { tail = (tail "|"); }
      else if (ej !~ /^[~\/]$/)
        { break; }
    }

  # A left bar is attached to the first visible column and to any
  # visible column that follows a "-" entry.
  lbar = "|";
  for (col = 1; col <= ncolElems; col++)
    { ej = colElem[col];
      if (ej == "-") 
        { lbar = "|"; }
      else if (ej !~ /^[~\/]$/)
        { rbar = (col == last ? tail : "");
          xej = format_elem(ej, "c");
          printf "\n    & %10s\\multicolumn{%d}{%s%s%s}{%s}",
            "",nspan,lbar,algn,rbar,xej;
          lbar = "";
        }
    }
  printf "\\cstr\n    \\\\\n";
  printf "  \\hline\n";
}

function format_elem(e,dir,  out,pos,mlen)
{
  # Returns the TeX markup for element "e".  "dir" is the macro-name
  # prefix ("r" or "c" at the call sites in this file).
  #   "+"       ->  \<dir>tot
  #   otherwise ->  \<dir>ev{<elemPrefix><e>}, where every occurrence
  #                 of the global "endMarker" inside "e" is replaced
  #                 by {\<dir>emk}.
  # "out", "pos" and "mlen" are local scratch variables.

  if (e == "+") { return ("\\" dir "tot"); }

  # Consume "e" from the left, moving the text before each end marker
  # (plus the replacement macro) into "out".
  mlen = length(endMarker);
  out = "";
  for (pos = index(e,endMarker); pos > 0; pos = index(e,endMarker))
    { out = (out substr(e,1,pos-1) "{\\" dir "emk}");
      e = substr(e,pos+mlen);
    }

  # Whatever is left of "e" contains no end marker.
  return ("\\" dir "ev{" elemPrefix out e "}");
}

function format_count(ct)
{
  # Returns the TeX markup for a count: \zeroct when "ct" is numerically
  # zero (which includes an empty/missing value), otherwise \ct{N} with
  # "ct" printed through the "%d" format.
  return (ct + 0 == 0 ? "\\zeroct" : ("\\ct{" sprintf("%d", ct) "}"));
}

function format_freq(fr)
{
  # Returns the TeX markup for a frequency: \zerofr when "fr" is
  # numerically below the global "minFreq", otherwise \fr{TEXT} where
  # TEXT is "fr" printed with "freqDigits" decimals and then trimmed.

  if (fr + 0 < minFreq) { return "\\zerofr"; }

  fr = sprintf("%*.*f", freqDigits+2, freqDigits, fr);
  # Trim the formatted text: when it compares >= 1.0 keep its first
  # freqDigits+1 characters, otherwise drop its first character
  # (e.g. "0.50" becomes ".50").
  fr = (fr >= 1.0 ? substr(fr, 1, freqDigits+1) : substr(fr,2));
  return ("\\fr{" fr "}");
}

function arg_error(msg)
{
  # Reports a command-line problem: writes "msg" and then the usage
  # synopsis to standard error, records exit status 1 in the global
  # "abort", and terminates with "exit 1".
  printf "%s\nusage: %s\n", msg, usage > "/dev/stderr";
  abort = 1;
  exit 1;
}

function data_error(msg)
{
  # Reports a problem with the current input record: writes "msg" to
  # standard error prefixed with the record number (FNR), records exit
  # status 1 in the global "abort", and terminates with "exit 1".
  printf("line %d: %s\n", FNR, msg) > "/dev/stderr";
  abort = 1;
  exit 1;
}