#! /usr/bin/gawk -f
# Last edited on 1999-07-18 07:43:53 by stolfi

# Reads a sequence of records of the form COUNT WORD FNUM
# where COUNT is the number of occurrences of WORD on page FNUM.
#
# Assumes each page is a finite random sample of some 
# frequency distribution over all words, that is characteristic
# of that page.  The distribution is estimated by Bayesian
# inference assuming all distributions over the full vocabulary
# (plus one "absent" word) are equally likely.
#
# Outputs for each WORD a line with TOTCOUNT NPAGES INDEX WORD, where TOTCOUNT
# is the total number of occurrences of WORD, NPAGES is the number
# of pages where the word occurs, and INDEX is an indicator
# of how lumpily the WORD is distributed over the pages.  

BEGIN {
  # A negative value means "no error so far"; error() overwrites it
  # with the exit status to use.
  abort = -1;

  # Scalar tallies: grand total of counts, distinct words, distinct pages.
  ct = 0;
  nw = 0;
  np = 0;

  # split("", A) forces A to be an empty array.
  split("", ct_wp);     # counts indexed by (word, page)
  split("", ct_w);      # counts indexed by word
  split("", ct_p);      # counts indexed by page
  split("", log_comb);  # not referenced by any other rule visible here

  printf "reading per-page word counts...\n" > "/dev/stderr";
}

# Once an error status has been recorded, stop processing input records.
abort >= 0 { exit abort }

/./ {
  # Every non-empty record must consist of exactly COUNT WORD FNUM.
  if (NF != 3) { error("bad NF"); }
  cnt = $1; word = $2; page = $3;

  # A given (word, page) pair may be reported only once.
  if ((word, page) in ct_wp) { error(("repeated pair " word " " page)); }

  # Tally first sightings before the updates below create the entries.
  if (!(word in ct_w)) { nw++; }
  if (!(page in ct_p)) { np++; }

  # Accumulate the count per pair, per word, per page and overall.
  ct_wp[word, page] += cnt;
  ct_w[word] += cnt;
  ct_p[page] += cnt;
  ct += cnt;
}

END {
  # If error() was called, propagate its exit status instead of
  # printing results computed from partial data.
  if (abort >= 0) { exit abort; }
  printf "computing information per word...\n"  > "/dev/stderr";

  # For each word compute
  #   x = SUM over pages of (m+1)*m*(m-1)  /  N^3
  # where m is the word's count on the page and N its total count.
  # Pages with m = 0 or m = 1 contribute nothing to the sum; if all N
  # occurrences fall on a single page, x = (N*N - 1)/(N*N).
  # Only words with x > 0 are printed, as "TOTCOUNT NPAGES INDEX WORD".
  for (w in ct_w)
    { 
      N = ct_w[w];
      if (N == 0)
        { 
          # The ratio below is undefined for a zero total, and gawk treats
          # division by zero as a fatal error that would abort the whole
          # run; report the word and move on.
          printf "warning: total count of \"%s\" is zero, word skipped\n", w > "/dev/stderr";
          continue;
        }
      totq = 0;
      nwp = 0;   # number of pages that have an entry for this word
      for (p in ct_p)
        { 
          if ((w,p) in ct_wp)
            { m = ct_wp[w,p]; nwp++; }
          else
            { m = 0; }
          q = (m+1)*m*(m-1);
          # if (q > 0) { printf "%7d %7d %8.4f %s\n", m, ct_p[p], q, w  > "/dev/stderr"; }
          totq += q;
        }
      x = totq/(N*N*N);
      # printf "------- ------- -------- -----------\n" > "/dev/stderr";
      # printf "%7d %7d %8.4f %s\n\n", ct_w[w], nwp, x, w  > "/dev/stderr";
      if (x > 0) { printf "%7d %7d %8.4f %s\n", ct_w[w], nwp, x, w; }
    }
}

# Reports MSG on stderr and terminates with exit status 1.  The status
# is also stored in `abort` so that the END rule can see it and bail out.
function error(msg)
{
  abort = 1;
  printf "%s\n", msg > "/dev/stderr";
  exit abort;
}