#! /usr/bin/gawk -f
# Last edited on 1999-01-30 17:30:58 by stolfi

BEGIN {
  abort = -1;
  usage = ( "list-page-champs [ -v maxChamps=NUM ] < INFILE > OUTFILE" );
  
  # Reads from INFILE records in the format
  #
  #   COUNT SEC PNUM FNUM WORD
  #
  # where COUNT is the count associated with WORD on the page, SEC is
  # a section tag, PNUM is a page's p-number, and FNUM is a further
  # page identifier that is copied to the output (presumably the
  # page's f-number -- TODO confirm).
  # 
  # Assumes they are grouped by SEC and PNUM and then sorted by COUNT
  # decreasing. Writes for each page a record of the form
  #
  #   SEC PNUM FNUM W1(C1) W2(C2) ... Wn(Cn)
  #
  # where W1, W2, ... Wn are the NUM (default 1) most popular WORDs 
  # in page PNUM, and C1, C2, ... Cn are the respective COUNTs. 
  #
  # The variable {abort} stays negative while all is well; the error
  # routines set it to the exit status that the END rule must return.

  if (maxChamps == "") { maxChamps = 1; }

  # Reject anything that is not a plain non-negative integer: a
  # non-numeric value would turn the limit test in the main rule into
  # a string comparison and silently disable the limit.
  if (maxChamps !~ /^[0-9]+$/)
    { arg_error(("invalid maxChamps = \"" maxChamps "\"\nusage: " usage)); }
  maxChamps += 0;
  
  # State of the page currently being accumulated (none yet).
  cur_key = "";
  split("", top_wd);
  split("", top_ct);
}

# Once an error status has been recorded, stop reading input.
abort >= 0 { exit abort }

# Main rule: handles every non-empty input line, which is expected to
# be a record of the form COUNT SEC PNUM FNUM WORD.
/./ {
  ct = $1
  sc = $2;
  pn = $3;
  fn = $4; 
  wd = $5;
  key = (sc " " pn " " fn)
  if (key != cur_key)
    { # First record of a new page: write out the previous page (if
      # any) and restart the per-page state.
      flush_page();
      nChamps = 0; 
      cur_key = key;
      # Seed the order check with this record's own count, so that the
      # first record of a page is accepted whatever its magnitude.
      last_ct = ct + 0;
    }
  # Counts within a page must not increase; "+ 0" forces a numeric
  # comparison even if the field does not look like a number.
  if (last_ct < ct + 0) { error("out of order"); }
  # Remember this count so that the next record is checked against it.
  last_ct = ct + 0;
  # Keep only the first {maxChamps} words of the page.
  if (nChamps < maxChamps) 
    { top_wd[nChamps] = wd; top_ct[nChamps] = ct; nChamps++; }
}

END {
  # This rule also runs after an {exit} elsewhere in the program; in
  # that case just propagate the recorded status.  Otherwise the last
  # page is still pending and must be written out.
  if (abort < 0)
    { flush_page(); }
  else
    { exit abort; }
}

function flush_page(   k)
{
  # Writes the output line for the current page: the page key,
  # left-justified in at least 14 columns, followed by one " WORD(COUNT)"
  # item for each of the {nChamps} entries saved in {top_wd}, {top_ct}.
  # Does nothing if no page has been started yet ({cur_key} is empty).
  if (cur_key == "") { return; }
  printf "%-14s", cur_key;
  k = 0;
  while (k < nChamps)
    { printf " %s(%d)", top_wd[k], top_ct[k];
      k++;
    }
  printf "\n";
}
      
function error(msg)
{ 
  # Reports {msg} on standard error, prefixed with the current input
  # record number, and terminates the program with status 1.  The
  # status is also stored in {abort} for the END rule to return.
  abort = 1;
  printf "line %d: %s\n", NR, msg >> "/dev/stderr";
  exit abort;
}

function arg_error(msg)
{ 
  # Reports {msg} on standard error, without a record-number prefix,
  # and terminates the program with status 1.  The status is also
  # stored in {abort} for the END rule to return.
  abort = 1;
  printf "%s\n", msg >> "/dev/stderr";
  exit abort;
}