#! /usr/bin/gawk -f
# Last edited on 2000-12-27 13:26:59 by stolfi

BEGIN{
  # Usage summary (kept for reference; nothing in this file prints it).
  usage = ( "compute-elem-count-distrib < WORDFILE > FREQFILE " );

  # Each input record has three fields:
  #
  #   COUNT FWORD LENGTH
  #
  # FWORD is a word factored into elements by "{}", and LENGTH is
  # an integer.  The output has one line per length from 1 up to
  # the largest LENGTH seen:
  #
  #   LENGTH COUNT FREQ FWORD
  #
  # COUNT is the summed count of all records with that LENGTH,
  # FREQ is COUNT divided by the grand total of all counts, and
  # FWORD is the first word encountered with that LENGTH.

  # Exit status requested by data_error; negative means "no error".
  abort = -1;

  # Largest LENGTH seen so far.
  hi = -1;

  # Per-LENGTH tables: total count and a sample word.
  split("", mct);
  split("", wsample);
}

# Once an error status has been recorded, stop consuming input.
abort >= 0 { exit abort }

# Skip comment lines (first non-blank character is "#") and lines
# that are empty or contain only spaces.
$0 ~ /^ *([#]|$)/ { next }

# Data record: "COUNT FWORD LENGTH".  Accumulates COUNT into the
# per-LENGTH total and the grand total, remembers the first FWORD
# seen for each LENGTH, and tracks the largest LENGTH.
/./ {
  if (NF != 3) { data_error(("wrong number of fields")); }
  # LENGTH must be an integer: a non-numeric value would turn the
  # "m <= hi" test in the END loop into a string comparison.
  if ($3 !~ /^[-+]?[0-9]+$/) { data_error(("bad LENGTH field \"" $3 "\"")); }
  ct = $1; fw = $2;
  # Force a numeric value so that spellings such as "07" or "+7" share
  # the array key "7" that the END loop looks up.
  m = $3 + 0;
  if (! (m in mct)) { wsample[m] = fw; }
  mct[m] += ct;
  totct += ct;
  if (m > hi) { hi = m; }
}

# Prints the table: one line for each length from 1 to the largest
# LENGTH seen, including lengths that never occurred (count 0).
END {
  # END rules still run after "exit" is called from a rule or function,
  # so bail out here to avoid printing a partial table after an error.
  if (abort >= 0) { exit abort; }
  printf "# len  count   freq example           \n";
  printf "# --- ------ ------ ------------------\n";
  for(m = 1; m <= hi; m++)
    {
      # Use "in" tests so that unseen lengths do not create array entries.
      n = ((m in mct) ? mct[m] : 0);
      w = ((m in wsample) ? wsample[m] : "");
      # Guard against a zero grand total (e.g. every COUNT is 0), which
      # would otherwise be a fatal division by zero.
      fr = (totct != 0 ? n/totct : 0);
      printf "  %3d %6d %6.4f %s\n", m, n, fr, w;
    }
}

# Reports a problem with the current input record on standard error,
# prefixed by the input file name and line number, then terminates
# input processing with exit status 1 (also recorded in "abort").
#   msg: text describing what is wrong with the record.
function data_error(msg)
{
  printf "file %s, line %d: %s\n",
    FILENAME, FNR, msg > "/dev/stderr";
  exit (abort = 1);
}
