#! /usr/bin/gawk -f
# 
# Usage: "$0 -v nmin=NNN -v mw=N.NNN -v mc=N.NNN"
#
# Computes the ratio of two counts for a list of patterns.
# The input must be the output of compare-freqs, in the 
# format " NT FT  NL FL  patt", where "NT","NL" are
# two counts, and "FT","FL" the corresponding relative 
# frequencies.  The output will have the format
# " NT FT  NL FL  rat mk patt" where "rat=(NL+1)/(NT+mc)".
#
# The "mk" field is a class code, assigned based on the 
# ratio and its certainty, and the parameters "mw", "mc",
# and "nmin", as follows:

function classify(NT, NL, nmin, mw, mc)
{
  # Returns the two-character class code for one pattern.
  #   NT, NL  the two counts of the pattern
  #   nmin    threshold used to decide whether a count is significant
  #   mw, mc  scale factor and additive correction used in the comparisons
  # The tests are tried in order; the first one that holds decides the code.

  # Patterns whose NT is below the threshold only get a tentative code.
  if (NT < nmin) {
    if (2*mw*(NL+1) < (NT+mc))  return "-?"   # unimportant but NL low
    if (2*mw*(NL+1) >= (NT+mc)) return "+?"   # unimportant but NL high
  }

  if (mw*(NL+1) < nmin)       return "oo"     # NL practically zero
  if ((NL-1) > NT - nmin)     return "##"     # NL practically NT
  if (mw*(NL-1) > NT - nmin)  return "||"     # NL practically maximum expected
  if (2*mw*(NL+1) < (NT+mc))  return "--"     # NL on the low side
  if (2*mw*(NL+1) >= (NT+mc)) return "++"     # NL on the high side

  # None of the tests above held.
  return "!!"                                 # program error
}

/^##/ {
  # Title line: drop the leading "##" marker so that the remaining text is
  # re-split into fields, then echo the first three fields with the titles
  # of the two added columns ("RelFr" and "MK") inserted before the third.
  sub(/^##/, "")
  printf "##%11.11s  %11.11s  RelFr  MK  %s\n",
         $1, $2, $3
  next
}

/^# / {
  # Other header line: drop the leading "# " marker so that the remaining
  # text is re-split into fields, then echo the first three fields with
  # dash fillers in the positions of the two added columns.
  sub(/^# /, "")
  printf "# %11.11s  %11.11s  -----  --  %s\n",
         $1, $2, $3
  next
}

/[0-9]\.[0-9]/ {
  # Data line (any line containing a digit, a period and a digit).
  # The three parameters are mandatory: an unset (or zero) value aborts
  # the run with exit status 1 and a message on standard error.
  if (mw == 0) {
    print "must define mw" > "/dev/stderr"
    exit 1
  }
  if (mc == 0) {
    print "must define mc" > "/dev/stderr"
    exit 1
  }
  if (nmin == 0) {
    print "must define nmin" > "/dev/stderr"
    exit 1
  }

  # Fields 1 and 3 are the two counts; fields 2 and 4 are copied through.
  NT = $1
  NL = $3

  # Class code and ratio for this pattern.
  mark = classify(NT, NL, nmin, mw, mc)
  rat = (NL + 1) / (NT + mc)

  # Echo the four input columns, then the ratio, the class code and field 5.
  printf "  %5d %5.3f  %5d %5.3f %6.3f  %s  %s\n",
         $1, $2, $3, $4, rat, mark, $5
  next
}