#! /n/gnu/bin/gawk -f
# Last edited on 1999-01-16 07:13:50 by stolfi

#   Reads a file containing records with the format
#
#     FREQ PNUM STRING 
#     1    2    3
# 
#   where
# 
#     PNUM       is a sequential page number, "001" to "234".
#
#     STRING     is a non-empty string.
#   
#     FREQ       is a count of occurrences of STRING on page PNUM.
#
#   Outputs a file containing, for each distinct STRING, one record
#   with the format
#
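#     TOTFR MAXFR SPECF PMAX FMAX STRING
#     1     2     3     4    5    6
#
#   where 
#
#     TOTFR      is the total number of occurrences of STRING.
#
#     MAXFR      is the maximum occurrence count on any single page.
#
#     SPECF      is the ratio MAXFR/TOTFR.
#     
#     PMAX       is one of the pages where STRING occurs with freq. MAXFR.
#
#     FMAX       is an extra item printed between PMAX and STRING; the
#                code in this file never reads a value for it from the
#                input, so it is normally printed as an empty string.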
#

function clear_counts()
{
  # Starts a fresh accumulation for the next word: the running total and
  # the per-page maximum go back to zero, and the "best page" fields go
  # back to their placeholder strings.
  maxpg = "???";
  maxfl = "f???";
  totct = maxct = 0;
}

function output_word()
{
  # Writes one output record for the current word `curwd`, using the
  # global accumulators `totct`, `maxct`, `maxpg` and `maxfl`.
  # The computed ratio is left in the global `specf`.
  #
  # A word whose total is zero (e.g. every count was 0 or non-numeric)
  # would otherwise make awk abort with a division-by-zero error, so the
  # ratio is reported as 0 in that case.
  specf = (totct != 0 ? maxct/totct : 0);
  printf "%7d %7d %5.3f %s %s %s\n", totct, maxct, specf, maxpg, maxfl, curwd;
}

BEGIN {
  # No word has been seen yet, so start with an empty current word and
  # freshly reset accumulators.
  curwd = "";
  clear_counts();
  # While `abort` is negative the guard rule below lets records through.
  abort = -1;
}

# Stop reading input as soon as `abort` holds a non-negative value.
abort >= 0 { exit }

/./ {
  # Handles one non-empty input record "FREQ PNUM STRING".
  # A change of STRING flushes the word accumulated so far and restarts
  # the counters, so records for the same STRING must be adjacent to
  # produce a single output record per word.
  #
  # FREQ is forced to a number: a malformed count then contributes 0
  # instead of being kept as a string, which would make later `>=` tests
  # against `maxct` fall back to string comparison.
  ct = $1 + 0;
  pg = $2;
  wd = $3;
  if (wd != curwd)
    { if (curwd != "") output_word();
      clear_counts();
      curwd = wd;
    }

  totct += ct;
  # `>=` means that among pages tied for the maximum, the last one read wins.
  # NOTE(review): `fl` is never assigned in the code visible here, so
  # `maxfl` becomes an empty string on this branch -- confirm whether an
  # input field was meant to be read into `fl`.
  if (ct >= maxct) { maxct = ct; maxpg = pg; maxfl = fl; }
  # NOTE(review): the final word is only written if an END rule (not
  # visible here) calls output_word() -- confirm.
  next;
}