#! /usr/bin/gawk -f
# 
# Usage: "$0 -v nmin=NNN -v mw=N.NNN mc=N.NNN"
#
# Computes the ratio of two counts for a list of patterns.
# The input must be the output of compare-freqs, in the 
# format " NT FT  NL FL  patt", where "NT","NL" are
# two counts, and "FT","FL" the corresponding relative 
# frequencies.  The output will have the format
# " NT FT  NL FL  rat mk patt" where "rat=(NL)/(NT+2)".
#
# The "mk" field is a class code, assigned based on the 
# ratio and its certainty, and the parameters "mw", "mc",
# and "nmin", as follows:

# classify(NT, NL, ratio, nmin, mw, mc)
#
# Returns the two-character class code for one pattern.
#
#   NT, NL  the two counts of the pattern
#   ratio   the precomputed count ratio
#   nmin    minimum count for a "certain" verdict in the first two classes
#   mw      a ratio of at least 1/mw puts the pattern in the "+" class
#   mc      count scale used to grade the "non-break" class
#
# Codes returned:
#   "++"  probably a word break               (ratio >= 1/mw, NT >= nmin)
#   "+?"  unimportant, but looks more like a word break
#   "::"  possible syllable break             (ratio >= 0.005, NL >= nmin)
#   ":?"  uncertain, but looks more like a syllable break
#   "??"  too rare, cannot tell               (2*NT < mc)
#   "-?"  uncertain, but looks more like a non-break   (NT < 2*mc)
#   "--"  non-break
function classify(NT, NL, ratio, nmin, mw, mc)
{
  # High ratio: word-break class; certainty is judged on NT.
  if (ratio >= 1.0/mw)
    return (NT >= nmin) ? "++" : "+?"

  # Intermediate ratio: syllable-break class; certainty is judged on NL.
  if (ratio >= 0.005)
    return (NL >= nmin) ? "::" : ":?"

  # Low ratio: non-break class, graded by how NT compares with mc.
  if (2*NT < mc)
    return "??"
  if (NT < 2*mc)
    return "-?"
  return "--"
}

# Column-title line (starts with "##"): re-emit the first two titles and
# the third field, inserting the titles of the two new columns.
/^##/ {
  # Drop the "##" prefix; modifying $0 makes awk re-split the fields.
  sub(/^##/, "")
  printf("##%11.11s  %11.11s  RelFr  MK  %s\n", $1, $2, $3)
  next
}

# Underline/comment line (starts with "# "): re-emit its first three
# fields, inserting dash fillers under the two new columns.
/^# / {
  # Drop the "# " prefix; modifying $0 makes awk re-split the fields.
  sub(/^# /, "")
  printf("# %11.11s  %11.11s  -----  --  %s\n", $1, $2, $3)
  next
}

# Data line (contains a decimal number): compute the ratio and class code
# and print the record with the two extra columns inserted before field 5.
/[0-9]\.[0-9]/ {
  # The three parameters are mandatory; an unset awk variable compares
  # equal to 0, so a zero value is reported as "not defined".
  if (mw == 0) {
    print "must define mw" > "/dev/stderr"
    exit 1
  }
  if (mc == 0) {
    print "must define mc" > "/dev/stderr"
    exit 1
  }
  if (nmin == 0) {
    print "must define nmin" > "/dev/stderr"
    exit 1
  }

  # Fields 1 and 3 are the two counts; fields 2 and 4 are passed through.
  NT = $1
  NL = $3

  # The "+2" in the denominator keeps the ratio finite when NT is zero.
  rat = NL / (NT + 2)
  mark = classify(NT, NL, rat, nmin, mw, mc)

  printf("  %5d %5.3f  %5d %5.3f %6.3f  %s  %s\n", $1, $2, $3, $4, rat, mark, $5)
  next
}