#! /usr/bin/gawk -f
# Last edited on 1999-01-18 22:34:17 by stolfi

BEGIN {
  # Input: records of the form
  #
  #    COUNT PAGE WORD
  #
  # where COUNT is the number of occurrences of WORD on page PAGE.
  #
  # Output: one record per distinct word, of the form
  #
  #    WORD TOTCT NPAGES NMISS SHAPE
  #
  # where TOTCT is the total occurrence count of the word, NPAGES is
  # the number of pages where the word occurs, NMISS is the number of
  # pages where the word does not occur, and SHAPE is the shape of the
  # word's per-page distribution: the multiset of its nonzero per-page
  # counts, listed in decreasing order.

  # Error status: negative means "no error so far"; a nonnegative
  # value is used as the exit status by the rules that test it.
  abort = -1;

  # Counters of distinct pages and distinct words seen.
  npages = 0;
  nwords = 0;

  # Start with empty tables: per-page totals, per-word totals,
  # and per-(page,word) counts.
  delete pct;
  delete wct;
  delete pwct;
}

# Once an error status has been recorded, stop processing input.
abort >= 0 {
  exit abort
}

# Every input record must have exactly three fields.
NF != 3 {
  file_error("wrong num of fields")
}

# Accumulate one COUNT PAGE WORD record into the global tables.
/./ {
  count = $1
  page = $2
  word = $3

  # Each (page, word) pair may appear at most once in the input.
  key = page SUBSEP word
  if (key in pwct) {
    file_error("repeated word/page pair")
  }
  pwct[key] += count

  # Count each distinct page and each distinct word the first time it is seen.
  if (!(page in pct)) {
    npages++
  }
  if (!(word in wct)) {
    nwords++
  }

  # Update the per-page, per-word and overall totals.
  pct[page] += count
  wct[word] += count
  ct += count
}

END {
  # An error was flagged while reading the input: write no output
  # and just propagate the error status.
  if (abort >= 0) { exit abort; }
  
  # Write one output record WORD TOTCT NPAGES NMISS SHAPE per word.
  for (w in wct)
    { # Collect the nonzero per-page counts of {w} in shape[1..ns].
      # The membership test must come first: merely referencing
      # pwct[p,w] for a pair that is absent would create that entry,
      # growing {pwct} to (number of pages) x (number of words) elements.
      split("", shape);
      ns = 0;
      for (p in pct)
        { if (((p,w) in pwct) && (pwct[p,w] != 0))
            { ns++; shape[ns] = pwct[p,w]; }
        }
      
      # Sort entries.  The counts are numeric, so gawk's {asort}
      # leaves them in increasing numeric order in shape[1..ns].
      if (ns > 1) { asort(shape); }
      
      # Join histogram into a string, largest count first.  The
      # parentheses are always emitted, so that a word whose counts
      # are all zero gets the well-formed empty shape "()".
      ss = "(";
      for (i=ns; i>=1; i--) { ss = ( ss shape[i] (i>1 ? "," : "") ); }
      ss = (ss ")");
      printf "%s %d %d %d %s\n", w, wct[w], ns, npages - ns, ss;
    }
}
  
# Reports {msg} on the standard error stream, prefixed with the current
# input file name and line number, records a nonzero status in the
# global {abort}, and terminates the script with exit status 1.
function file_error(msg)
{
  abort = 1;
  printf("file %s, line %d: %s\n", FILENAME, FNR, msg) > "/dev/stderr";
  exit 1;
}