#! /n/gnu/bin/gawk -f

# usage 
#   compute-strangeness \
#     -v wordCounts=WCNTFILE \
#     -v pageCounts=PSIZFILE \
#     < WORDPAGE.pwct \
#     > WORDPAGE.pwst
#
#  The input file must have fields WPCOUNT FNUM WORD
#  where WPCOUNT is the number of occurrences of WORD in FNUM.
#
#  The output will have fields WPCOUNT WCOUNT PCOUNT STRANG FNUM WORD
#  where PCOUNT is the number of words in the page,
#  WCOUNT is the total count for WORD in the whole book,
#  and STRANG is a measure of how anomalous WPCOUNT is,
#  given the page's PCOUNT and the overall probability
#  of WORD.

function error(msg)
{ 
  # Reports a fatal error on stderr, prefixed with the current input
  # record number NR, and terminates the script with a failure status.
  #
  #   msg  text of the diagnostic; trailing newlines are dropped
  #        because the format below already supplies one.
  #
  # Does not return.  Sets the global "abort" flag before exiting so
  # that an END rule, if the script has one, can tell that the run 
  # was cut short.
  sub(/\n+$/, "", msg);
  printf "line %d: %s\n", NR, msg > "/dev/stderr"
  abort = 1
  # A bare "exit" would report success (status 0) to the caller;
  # use a nonzero status so that failures are detectable.
  exit 1
}

BEGIN {
  # Loads the two lookup tables named by the "-v" variables
  # wordCounts and pageCounts before any input record is read.
  # Each table file must have exactly two fields per line: 
  # a key (field 1) and a count (field 2).  On exit from this
  # block, wCount[] and pCount[] hold the counts indexed by key,
  # and nOccs holds the total of the counts, which must be the
  # same in both tables.  Any problem is reported through error().
  abort = 0;

  # Read word count table:
  if (wordCounts == "") 
    { error("must specify \"-v wordCounts=FILE\"\n"); }
  split("", wCount);
  nWords = 0; nOccsW = 0;
  while((rdStatus = (getline lin < wordCounts)) > 0) { 
    split(lin, fld);
    # Exactly two fields are accepted: 
    if ((3 in fld) || ! (2 in fld)) 
      { error("bad wordCounts entry = \"" lin "\""); }
    if (fld[1] in wCount) 
      { error("repeated key = \"" lin "\""); }
    wCount[fld[1]] = fld[2];
    nWords++; nOccsW += fld[2];
  }
  # getline yields 0 at end of file but -1 on an open/read failure;
  # without this check an unreadable file would pass for an empty table.
  if (rdStatus < 0) 
    { error("cannot read wordCounts file \"" wordCounts "\"" (ERRNO != "" ? ": " ERRNO : "")); }
  close (wordCounts);
  printf "loaded %6d word counts (%d word occurrences)\n",
    nWords, nOccsW > "/dev/stderr";

  # Read word count per page table:
  if (pageCounts == "") 
    { error("must specify \"-v pageCounts=FILE\"\n"); }
  split("", pCount);
  nPages = 0; nOccsP = 0;
  while((rdStatus = (getline lin < pageCounts)) > 0) { 
    split(lin, fld);
    # Exactly two fields are accepted: 
    if ((3 in fld) || ! (2 in fld)) 
      { error("bad pageCounts entry = \"" lin "\""); }
    if (fld[1] in pCount) 
      { error("repeated key = \"" lin "\""); }
    pCount[fld[1]] = fld[2];
    nPages++; nOccsP += fld[2];
    }
  # Same open/read failure check as for the word count table.
  if (rdStatus < 0) 
    { error("cannot read pageCounts file \"" pageCounts "\"" (ERRNO != "" ? ": " ERRNO : "")); }
  close (pageCounts);
  printf "loaded %6d page sizes (%d word occurrences)\n", 
    nPages, nOccsP > "/dev/stderr";
    
  # Both tables must account for the same number of word occurrences:
  if (nOccsW != nOccsP) { error("incongruent occurrence counts\n"); }
  nOccs = nOccsW;

  # printf "%6s %6s %6s %6s %-6s %s\n", 
  #       "#[w,p]", "#[w]", "#[p]", "str", "fnum", "word" > "/dev/stderr";
  # printf "%6s %6s %6s %6s %-6s %s\n", 
  #       "------", "------", "------", "------", "------", "------" > "/dev/stderr";
}

function strangeness(m, n, p,   eps, epsSq, ref, obs)
{
  # Scores how far the observed rate of a word departs from its
  # expected rate, as a difference of natural-log odds:
  #
  #   log(obs/(1-obs)) - log(ref/(1-ref))
  #
  # where
  #   m    number of occurrences observed,
  #   n    number of trials,
  #   p    overall frequency of the word,
  #   obs  = m/(n+1), the observed rate, and
  #   ref  = p blended with eps = 1/(n+1), so that a very small p 
  #          is pulled away from zero.
  #
  # The result is positive when obs exceeds ref and negative when
  # it falls below.  No guard is made against m = 0, m > n, or
  # ref = 1; callers are presumably expected to supply 1 <= m <= n
  # and p < 1 (TODO confirm).
  eps = 1/(n+1);
  epsSq = eps*eps;
  ref = sqrt((p*p + epsSq)/(1 + epsSq));
  obs = m/(n+1);
  return log(obs/ref) - log((1-obs)/(1-ref));
}

/./ {
  # Handles every non-empty input record, whose fields are
  # WPCOUNT FNUM WORD, and writes one output record with fields
  # WPCOUNT WCOUNT PCOUNT STRANG FNUM WORD.
  wpct = $1; fnum = $2; word = $3;

  # Size of the page; a page missing from the table is fatal.
  if (fnum in pCount) 
    { pct = pCount[fnum]; }
  else 
    { error("page \"" fnum "\" not in page size table\n"); }

  # Book-wide count of the word; a missing word is fatal too.
  if (word in wCount) 
    { wct = wCount[word]; }
  else 
    { error("word \"" word "\" not in word count table\n"); }

  # The word's overall frequency is its count over all occurrences.
  str = strangeness(wpct, pct, wct/nOccs);
  print wpct, wct, pct, sprintf("%6.4f", str), fnum, word;

  # Skip any remaining rules for this record.
  next;
}