#! /n/gnu/bin/gawk -f

# Usage: "$0 [-v maxlen=MAXLEN] [-v wdefs=WDEFS] [-v ctwd=CTWD] [-v nblocks=NBLOCKS] \
#            [-v percent={0|1}] [-v avg={0|1}] < INFILE > OUTFILE"
#
# This script reads from standard input a list of triples BLOCK WORD OCC,
# where BLOCK is a block number (from 1 to NBLOCKS), WORD is a word occurring
# in that block, and OCC is WORD as it actually occurred in the block. The
# file should be sorted by WORD and then OCC.
#
# If "wdefs" is specified, it also reads from the file called WDEFS a
# list of records of the form
#
#    WORD CLASS SKEY DESCR 
#
# where WORD is a Voynich word, CLASS is a short (3 bytes or less) tag
# for WORD, SKEY is a short key for sorting (4 bytes or less,
# e.g. page number), and DESCR is an arbitrary string (without
# embedded blanks) for WORD.  In case of multiple entries for the same
# word, the last one takes precedence.
# 
# The script prints a table of the form 
#
#   WORD OCC CLASS SKEY AVP DEV TOTCT XXX...XXX YYY...YYY DESCR
#
# where 
#
#   WORD is one word occurring in the input data
#
#   CLASS is the class of WORD as specified in the WDEFS file, or "-" if
#     no class was specified;
# 
#   SKEY is the sorting key associated with the words in the WDEFS file,
#   or "-"  if none;
# 
#   AVP and DEV (printed only if avg=1) are the mean and the estimated
#   standard deviation of the block position for this word;
#
#   TOTCT is the total number of occurrences of WORD;
#
#   XXX...XXX are the counts of how many times that word
#   occurred in each block (with '.' meaning 0);
#
#   YYY...YYY (printed only if percent is nonzero) are the same counts
#   expressed as percentages of TOTCT;
#
#   DESCR is the description string associated with the words
#   in the WDEFS file, or "-" if none;
# 
# Each count is printed with CTWD bytes. If CTWD > 1 then the maximum
# value printed MAXCT is 10^(CTWD-1)-1, with at least one leading blank;
# else MAXCT is 9.  The percentages are scaled from
# [0% _ 100%] to [0 _ MAXCT] and rounded.

# Initialization: applies defaults to the command-line parameters,
# derives MAXCT from CTWD, and loads the optional word-definitions file.
BEGIN { 
  # Current WORD/OCC group being accumulated by the main rule.
  word = ""; 
  wocc = "";

  # Defaults for parameters that were not given with "-v".
  if (maxlen == 0) maxlen = 16; 
  if (nblocks == 0) nblocks = 20;
  if (ctwd == 0) ctwd = 1;

  # Largest count that can be printed in a CTWD-byte column: 9 for a
  # one-byte column, else 10^(CTWD-1)-1 so that there is always at
  # least one leading blank.
  if (ctwd == 1) 
    { maxct = 9 }
  else
    { maxct = 1; 
      for (i = 1; i < ctwd; i++) { maxct = maxct*10 }
      maxct-- 
    } 

  # Per-word tables filled from WDEFS; they start out empty.
  split("", wdclas);
  split("", wdskey);
  split("", wddesc);
  if (wdefs != "")
    {
      # Keep the getline status so that a read failure (-1) can be
      # told apart from a normal end of file (0) after the loop.
      while ((rdstat = (getline lin < wdefs)) > 0) { 
        split (lin, wfld);
        # A valid record has exactly four fields: WORD CLASS SKEY DESCR.
        if ((! (4 in wfld)) || (5 in wfld)) 
          { printf "bad wdefs = %s\n", lin > "/dev/stderr"; continue; }
        wd = wfld[1];
        clas = wfld[2];
        skey = wfld[3];
        desc = wfld[4];
        # Later entries for the same word overwrite earlier ones.
        wdclas[wd] = clas;
        wdskey[wd] = skey;
        wddesc[wd] = desc;
      }
      # Without this warning an unreadable file would silently give
      # every word the "-" class, key and description.
      if (rdstat < 0)
        { printf "cannot read wdefs file = %s\n", wdefs > "/dev/stderr"; }
      close (wdefs);
    }
}

function avp(c,   i, s, n)
{
  # Computes the average word position from histogram "c", where
  # c[i] is the number of occurrences in block i and each occurrence
  # in block i is placed at position i-0.5.
  #
  # Returns 0 when the histogram holds no occurrences at all (empty
  # array or all-zero counts) instead of dying on a division by zero.
  s = 0.0
  n = 0
  for (i in c) { s += (i-0.5)*c[i]; n += c[i] }
  if (n == 0) return 0
  return s/n
}

function dev(c, a,    i, d, bias, slop, ss, n)
{
  # Computes the estimated standard deviation of the word position from the 
  # histogram "c" and average position "a".  As in the average, an
  # occurrence in block i is placed at position i-0.5.
  #
  # The bias term tries to fix the deviation so that
  # rare words do not come out looking localized.  It depends on the
  # global "nblocks" and on the total count n, and shrinks as n grows.
  #
  # Returns 0 when the histogram holds no occurrences at all, instead
  # of dying on a division by zero.
  ss = 0.0
  n = 0
  for (i in c) 
    { d = (i-0.5) - a; ss += (d*d)*c[i]; n += c[i] }
  if (n == 0) return 0
  slop = (nblocks-1.0)/n
  bias = (1.0 + slop*slop)/12.0
  return sqrt(ss/n + bias)
}

function printword(w, o, t, c,     i, a, d, clas, skey, desc)
{
  # Prints one output line for word "w" as it occurred ("o"):
  # the word, the occurrence, its class and sort key, optionally the
  # average position and deviation, the total count "t", the per-block
  # location map "c", optionally the map as scaled percentages, and
  # finally the description.
  #
  # Reads the globals maxlen, ctwd, maxct, nblocks, avg, percent and
  # the wdclas/wdskey/wddesc tables.  "clas", "skey" and "desc" are
  # locals here so that the globals of the same names are not clobbered.
  
  printf "%-*s ", maxlen, w;
  printf "%-*s ", maxlen, o;

  # Look up the word's definition; missing fields are shown as "-".
  if (w in wdclas) 
    { clas = wdclas[w]; skey = wdskey[w]; desc = wddesc[w]; }
  else
    { clas = ""; skey = ""; desc = ""; } 
  printf "%3.3s ", (clas == "" ? "-" : clas);
  printf "%4.4s ", (skey == "" ? "-" : skey);

  if (avg)
    { a = avp(c);
      d = dev(c, a);
      printf "%5.1f %5.1f ", a, d;
    }
  printf "%5d ", t;

  # Raw counts, one CTWD-byte cell per block: "." for zero, and
  # counts at or above MAXCT are shown as MAXCT.
  for (i=1; i<=nblocks; i++) 
    { if (c[i] == 0) printf "%*s", ctwd, "."
      else if (c[i] >= maxct) printf "%*d", ctwd, maxct
      else printf "%*d", ctwd, c[i]
    }

  # The same counts rescaled from [0 _ t] to [0 _ MAXCT] and rounded.
  if (percent != 0) 
    { printf "  "
      for (i=1; i<=nblocks; i++) 
        { if (c[i] == 0) printf "%*s", ctwd, "."
          else if (c[i] >= t) printf "%*d", ctwd, maxct
          else printf "%*d", ctwd, int((c[i]*maxct)/t + 0.5)
        }
    }
  printf " %s", (desc == "" ? "-" : desc);
  printf "\n"
}

# Main rule, run for every non-empty input line "BLOCK WORD OCC".
# Accumulates the per-block counts of the current (WORD, OCC) group in
# "wmap" and its total in "totct"; when the group changes, the finished
# group is printed and the accumulators are reset.
/./ {
  # Validate the block number before touching any state, so that the
  # END rule (which still runs after "exit") only ever prints
  # consistent data.  The block must be numeric, an integer, and
  # within 1..nblocks.
  block = $1
  if ( ((block + 0) != block) || (int(block) != block) || (block < 1) || (block > nblocks) )
    { # %s, not %d: a non-numeric value would otherwise be shown as 0.
      printf "bad block number = %s\n", block > "/dev/stderr"
      exit 1
    }
  # Force a numeric subscript: the field text (e.g. "01" or "1e1")
  # would otherwise index a different element than the wmap[i] that
  # printword reads.
  block = block + 0

  if (($2 != word) || ($3 != wocc))
    { # New group: flush the previous one (if any) and start afresh.
      if (word != "") printword(word, wocc, totct, wmap)
      for (i=1; i<=nblocks; i++) wmap[i] = 0;
      totct = 0;
      word = $2;
      wocc = $3;
    }
  totct++
  wmap[block]++
}

# Flush the last (WORD, OCC) group, if any input line was seen.
END {
  if (length(word) > 0)
    { printword(word, wocc, totct, wmap) }
}