#! /usr/bin/gawk -f
# Last edited on 1999-07-28 02:01:26 by stolfi

# Reads a a sequence of records of the form COUNT WORD FNUM
# where COUNT is the number of occurrences of WORD on page FNUM.
#
# Assumes each page is a finite random sample of some 
# frequency distribution over all words, that is characteristic
# of that page.  The distribution is estimated by Bayesian
# inference assuming all distributions over the full vocabulary
# (plus one "absent" word) are equally likely.
#
# Outputs for each WORD a line with TOTCOUNT BITS WORD, where TOTCOUNT
# is the total number of occurrences of WORD, and BITS is the estimated
# number of bits of information that the occurrence of WORD in a gives
# about the page's identity, computed from those estimated per-page
# frequency distributions.

BEGIN {
  abort = -1;
  split("", ct_wp);
  split("", ct_p);
  split("", ct_w);
  nw = 0; np = 0;
  printf "reading per-page word counts...\n"  > "/dev/stderr";
}

(abort >= 0) { exit abort; }

/./{
  if (NF != 3) { error("bad NF"); }
  c = $1;
  w = $2;
  p = $3;
  
  if ((w,p) in ct_wp) { error(("repeated pair " w " " p)); }
  ct_wp[w,p] += c;
  
  if (! (p in ct_p)) { np++; }
  ct_p[p] += c;
  
  if (! (w in ct_w)) { nw++; }
  ct_w[w] += c;
}

END {
  if(abort >= 0) { exit abort; }
  printf "computing information per word...\n"  > "/dev/stderr";
  for (w in ct_w)
    { 
      # Below, "f" if the estimated frequency of word "w" in the 
      # language of page "p", and 
      # "totf" is the sum of "f" for word "w" over all pages "p".
      # Also "toth" if the sum of "f*ln(f)" over 
      # The estimated probability "q" of page "p" given that a randomly
      # picked word is "w" is then "f/totf".  
      totf = 0;
      toth = 0;
      for (p in ct_p)
        { if ((w,p) in ct_wp)
            { f = (ct_wp[w,p]+1)/(ct_p[p] + nw + 1); }
          else
            { f = 1/(ct_p[p] + nw + 1); }
          # printf "%8.5f %s\n", f, p > "/dev/stderr";
          totf += f;
          toth += -f*log(f)
        }
      # The estimated entropy of the page "p", given
      # the word "w", is then the sum of "q*lg(q)" over all 
      # pages; which turns out to be
      h = (toth / totf + log(totf))/log(2);
      # printf "------- -----------\n%7.5f %s\n\n", h, w  > "/dev/stderr";
      # The information given by "w" about "p" is then
      bits = log(np)/log(2) - h;
      printf "%7d %7.4f %s\n", ct_w[w], bits, w;
    }
}

function error(msg)
{
  printf "%s\n", msg > "/dev/stderr"; 
  abort = 1; exit 1;
}