#! awk -f
# Last edited on 1998-07-17 06:20:41 by stolfi

BEGIN{
  usage = "compute-chunkiness < TABLE > LIST"
  
  # Reads a table with entries of the form COUNT FNUM WORD PNUM
  # Prints a list FLATNESS WORD wher FLATNESS is 0 for 
  # words that occur only in one page,and maximum for
  # evenly distributed words.
  
  abort = -1;
  
  # Number of distinct pages and words
  nw = 0; np = 0;
  # Number of distinct pages per word and distinct words per pages
  split("", nw_p);
  split("", np_w);
  
  # Number of word occurrences
  no = 0;
  # Number of word occurrences per page and per word
  split("", no_w);
  split("", no_p);
  # Number of word occurrences per (page,word) pair
  split("", no_wp);
  
  # Lazy table "n -> log(n!)"
  split("", lfac); lfac[0] = 0;
}

/./{
  if (abort >= 0) { exit abort; }
  if(NF != 4) { error(("line " NR ": bad format")); }
  o = $1; p = $2; f = $3; w = $4;
  if((w,p) in no_wp) { error(("line " NR ": repeated entry")); }
  no_wp[w,p] = o;
  if (! (p in no_p)) { ff[np] = f; pp[np] = p; np++; no_p[p] = 0; }
  no_p[p] += o; 
  if (! (w in no_w)) { ww[nw] = w; nw++; no_w[w] = 0; }
  no_w[w] += o;
  no += o;
  next;
}

END{
  if (abort >= 0) { exit abort; }
  for (w in no_w)
    { 
      # Compute the strangeness of word "w"
      m = no;
      mw = no_w[w];
      wen = logfac(mw) - (mw/m)*logfac(m);
      printf "  %8.3f  %3d/%3d  %-6s %-15s\n", \
        wen/log(2), no_w[w], no, "", w  >  "/dev/stderr";
      for (p in no_p)
        { mwp = no_wp[w,p]; mp = no_p[p];
          wpen = (mw/m)*logfac(mp) - logfac(mwp);
          printf "  %8.3f  %3d/%3d  %-6s %-15s\n", \
            wpen/log(2), no_wp[w,p], no_p[p], p, w  >  "/dev/stderr";
          wen += wpen;
        }
      wen = wen/log(2);
      printf"%8.3f %s\n", wen, w; 
      printf"%8.3f %s\n", wen, w >  "/dev/stderr"; 
    }
}

function logcomb(v,  i,k,m,s)
{
  # log of the number of ways of tossing "m" items 
  # into "k" buckets such that the total counts
  # are "v[i]", where "k" is the number of entries 
  # of "v" and "m" is their sum.
  k = 0; m = 0;
  for (i in v) { k++; m += v[i]; }
  s = logfac(m);
  for (i in v) { s -= logfac(v[i]); }
  return (s);
}

function logfac(n,   i,s)
{
  # log of factorial of "n"
  i = n + 0;
  while (!(i in lfac)) 
    { if (i == 0) { error("bang " n); } i--; }
  s = lfac[i];
  while (i < n) { i++; s += log(i); lfac[i] = s; }
  return (s);
}

function error(msg)
{ printf "%s\n", msg >>  "/dev/stderr";
  abort = 1; exit 0;
}