#! /usr/bin/gawk -f
# Last edited on 1999-07-21 17:41:38 by stolfi

BEGIN {
  usage = "estimate-freqs -v nItems=NITEMS < INFILE > OUTFILE";

  # Reads a file of COUNT ITEM pairs, as produced by "uniq -c",
  # that respresents a finite sample of some discrete random variable.
  # Outputs a similar file with COUNT FREQ ITEM lines, where FREQ is
  # the estimated frequency of ITEM in the variable.
  # The estimator used is (COUNT + 1) / (TOTCOUNT + NITEMS),
  # where TOTCOUNT is the sampel size (sum of all COUNTs), and 
  # NITEMS is the number of distinct values that the variable 
  # may nominally assume.
  #
  # Also outputs an extra line with COUNT = 0, 
  # FREQ = 1/(TOTCOUNT + NITEMS), and ITEM = "*ETC*",
  # representing all ITEM values that are not listed 
  # explicitly.

  abort = -1;
  if (nItems == "") { fatal_error("must define \"nItems\""); }
  totCount = 0;
  n = 0; xItems = 0;
}

(abort >=0 ) { exit abort; }

($2 == "*ETC*") { next; }

/^([#]|[ ]*$)/ {
  ct[n] = "#"; it[n] = $0;
  n++;
  next;  
}

// {
  if (NF != 2) { fatal_error(("line " NF ": bad input format")); }
  totCount += $1;
  ct[n] = $1; it[n] = $2;
  n++; xItems++;
  next;  
}

END {
  if (abort >= 0) { exit abort; }
  if (xItems > nItems) { fatal_error("too many items"); }
  den = (totCount + nItems);
  for (i=0; i<n; i++)
    { if (ct[i] == "#")
        { print it[i]; }
      else
        { printf "%7d %7.5f %s\n", ct[i], (ct[i]+1)/den, it[i]; }
    }
  printf "%7d %7.5f %s\n", 0, 1/den, "*ETC*";
}
  
function fatal_error(msg)
{ 
  printf "line %d: %s\n", NR, msg > "/dev/stderr";
  abort = 1;
  exit 1;
}