#! /usr/bin/gawk -f
# Last edited on 2012-05-05 14:36:07 by stolfilocal

BEGIN {
  # Purpose: tabulate per-section word counts and/or frequencies as a
  # LaTeX table.  Each input record is expected to look like
  #
  #   SECTAG COUNT FREQ WORD
  #
  # with WORD written in EVA and SECTAG a section tag such as "bio.1"
  # or "txt.n".  The table gets one group of columns per section, in
  # the order in which the sections first appear in the input.

  # Negative means "no error so far"; error handlers set it to the
  # exit status to be returned.
  abort = -1;

  usage = ( "tex-format-word-freqs-by-section \\\n" \
    "  [ -v showCounts=BOOL ] \\\n" \
    "  [ -v showFreqs=BOOL ] \\\n" \
    "  < INFILE.wct > OUTFILE.tex" \
  );

  # Both optional columns are shown unless the caller says otherwise:
  if (showCounts == "")
    { showCounts = 1; }
  if (showFreqs == "")
    { showFreqs = 1; }

  # Number of sections seen, and length of the longest section list:
  ns = 0;
  nrows = 0;

  # Per-section tables, indexed by section number s in [0..ns-1]:
  split("", nw);     # Number of words stored for section s.
  split("", tag);    # Tag of section s.

  # Reverse map, indexed by section tag, giving the section number:
  split("", sindex);

  # Per-entry tables, indexed by [s,r] with r in [0..nw[s]-1]:
  split("", wct);    # Count field.
  split("", wfr);    # Frequency field.
  split("", wrd);    # Word field.
}

# Once an error has been flagged, stop processing input at once.
abort >= 0 { exit abort }

# Ignore comment lines (first non-blank is "#") and lines that are
# empty or contain only spaces.
$0 ~ /^ *([\#]|$)/ { next }

/./ { 
  # Data record: must consist of exactly SECTAG COUNT FREQ WORD.
  if (NF != 4) { data_error(("bad line format = |" $0 "|")); }
  tg = $1; ct = $2; fr = $3; w = $4;

  # Reject words containing a LaTeX-special character that is not
  # preceded by a backslash.  The "(^|...)" alternative makes the test
  # also catch a special character in the very first position of the
  # word, which has no preceding character at all.
  if (match(w, /(^|[^\\])[$&% _\^#]/))
    { data_error(("bad word \"" w "\" - has \"" substr(w,RSTART,RLENGTH) "\"")); }

  # Find the section's index, allocating a new one on first sight so
  # that sections keep their order of appearance:
  if (! (tg in sindex)) 
    { s = ns; ns++; tag[s] = tg; nw[s] = 0; sindex[tg] = s; }
  else
    { s = sindex[tg]; }

  # Append the entry to the section's list:
  r = nw[s]; 
  nw[s]++;
  wct[s,r] = ct;
  wfr[s,r] = fr;
  wrd[s,r] = w;

  # The table needs as many rows as the longest section list:
  if (nrows < nw[s]) { nrows = nw[s]; }
  next;
}

END { 
  # Emit the table only if no error was flagged while reading the
  # input; otherwise just propagate the recorded exit status.
  if (abort < 0)
    { print_word_table(); }
  else
    { exit abort; }
}

function print_word_table(   s,r,nspan,bar,w,ct,fr,xw,xct,xfr)
{
  # Writes to standard output a LaTeX "tabular" environment with one
  # group of columns per section: an optional count column, an optional
  # frequency column (controlled by the globals `showCounts' and
  # `showFreqs'), and the word column.  Reads the globals `ns', `nrows',
  # `tag', `nw', `wct', `wfr', `wrd' filled in by the input rules.
  # The entries are wrapped in the LaTeX macros \ct, \fr, \ev and each
  # row ends with \str; those macros are not defined here.

  printf "%% Created by %s\n", ARGV[0];

  # Table header:
  printf "\\begin{tabular}{|";
  for (s = 0; s < ns; s++)
    { if (showCounts) { printf "r"; }
      if (showFreqs)  { printf "r"; }
      printf "l|";
    }
  printf "} \\hline\n";
  
  # Column headers (each tag spans all columns of its section; only
  # the last one gets a closing vertical bar):
  nspan = (showCounts ? 1 : 0) + (showFreqs ? 1 : 0) + 1;
  printf "  ";
  for (s = 0; s < ns; s++)
    { if (s > 0) { printf " &\n  "; }
      bar = (s == ns-1 ? "|" : "");
      printf "\\multicolumn{%d}{|c%s}{{\\tt %s}}", nspan, bar, tag[s];
    }
  printf " \\\\ \\hline\n";
  
  # Table entries:
  for (r = 0; r < nrows; r++)
    { printf "  ";
      for (s = 0; s < ns; s++)
        { if (r < nw[s])
            { w = wrd[s,r]; ct = wct[s,r]; fr = wfr[s,r];
              xct = ("\\ct{" sprintf("%d", ct) "}"); 
              # Drops the first character of the formatted value (the
              # leading "0" when the frequency is below 1):
              xfr = ("\\fr{" substr(sprintf("%5.3f", fr),2) "}");
              xw =  ("\\ev{" w "}");
            }
          else
            { # This section has fewer than r+1 words: leave the cells
              # blank.  (The former test `i == nw+1' used the array
              # `nw' as a scalar, which makes gawk abort.)
              xct = "";
              xfr = "";
              xw =  "";
            }
          printf "  ";
          if (showCounts) { printf "%10s &", xct; }
          if (showFreqs) { printf "%10s &", xfr; }
          printf "%-10s ", xw;
          if (s == ns-1) 
            { printf "\\str\\\\\n"; }
          else
            { printf "&\n  "; }
        }
    }
  printf "  \\hline\n"; 
  printf "\\end{tabular}%%\n"; 
}

function arg_error(msg)
{ 
  # Reports a command-line argument error: writes `msg' and the usage
  # text to standard error, records the failure in the global `abort',
  # and exits with status 1.
  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1; exit 1;
}

function data_error(msg)
{ 
  # Reports an input data error: writes `msg' to standard error,
  # prefixed by the current record number within the input file, then
  # records the failure in the global `abort' and exits with status 1.
  printf("line %d: %s\n", FNR, msg) > "/dev/stderr";
  abort = 1;
  exit 1;
}