#! /usr/bin/gawk -f
# Last edited on 2012-02-14 01:49:46 by stolfilocal
BEGIN {
    # Input:  records of the form "{COUNT} {OLD_SPELLING} {SEP} {NEW_SPELLING}".
    # Action: for each {OLD_SPELLING}, remembers the {NEW_SPELLING} that came
    #         with the largest {COUNT}, and what fraction of the word's total
    #         count that largest count represents.
    # Output: "{OLD_SPELLING} {NEW_SPELLING} # {FRACTION}" on stdout when the
    #         fraction is at least 0.800 and the two spellings differ; words
    #         below that fraction are reported on stderr as "ambiguous".

    # Tables indexed by {oldsp}, created empty:
    split("", o_totct);   # Sum of the counts of all records tallied for {oldsp}.
    split("", o_maxct);   # Largest count among the {oldsp,newsp} records tallied.
    split("", o_newsp);   # The {newsp} of the record holding that largest count.
    split("", o_npairs);  # Number of records tallied for {oldsp}.

    # Exit status requested by the error routines; negative means no error so far.
    abort = -1;
}
# Guard: if an error status has already been recorded, stop processing input.
(abort >= 0) {
    exit abort
}
/^[ ]*[0-9]+[ ].*[ ][=|<>()][ ].*$/ {
    # Tallies one "{COUNT} {OLD_SPELLING} {SEP} {NEW_SPELLING}" record.
    # Anything other than exactly four fields is rejected as malformed.
    if (NF != 4) { data_error(("bad format")); }

    cnt  = $1;
    word = $2;
    sep  = $3;
    repl = $4;

    # Records whose separator is "<" or ">" are accepted but not tallied.
    if ((sep == "<") || (sep == ">")) { next; }

    if (! (word in o_npairs)) {
        # First tallied record for this old spelling: start its tallies at zero.
        o_totct[word] = 0;
        o_maxct[word] = 0;
        o_npairs[word] = 0;
    }

    o_npairs[word]++;
    o_totct[word] += cnt;

    # Strictly greater is required, so the earliest record wins among equal counts.
    if (cnt > o_maxct[word]) {
        o_maxct[word] = cnt;
        o_newsp[word] = repl;
    }

    next;
}
# Catch-all: a record reaching this rule did not match the tally pattern,
# so it is reported as malformed input.
{
    data_error("bad format");
}
END {
    # Writes the selected mapping for every old spelling that was tallied.

    # An error raised while reading input takes precedence: pass its status on.
    if (abort >= 0) { exit abort; }

    # Visit the old spellings in the order produced by asorti() on the table indices.
    nold = asorti(o_totct, k_oldsp);
    for (k = 1; k <= nold; k++) {
        oldsp = k_oldsp[k];
        np = o_npairs[oldsp];
        totct = o_totct[oldsp];
        newsp = o_newsp[oldsp];
        maxct = o_maxct[oldsp];

        # Consistency checks.  The messages name the word and the failed check so
        # that the cause can be traced; a total of zero happens when every record
        # for the word carried a count of 0, and it would break the division below.
        if (np <= 0) {
            prog_error(sprintf("no records tallied for \"%s\"", oldsp));
        }
        if (totct <= 0) {
            prog_error(sprintf("non-positive total count %s for \"%s\"", totct, oldsp));
        }

        # Share of the word's total count held by its most frequent new spelling.
        fract = maxct/totct;
        if (fract >= 0.800) {
            # Dominant mapping: written only when it actually changes the spelling.
            if (oldsp != newsp) {
                printf "%-30s %-30s # %8.6f\n", oldsp, newsp, fract;
            }
        } else {
            # No dominant mapping: reported on stderr, not written to the output.
            printf "ambiguous: %-20s %-20s # %8.6f\n", oldsp, newsp, fract > "/dev/stderr";
        }
    }
}
function prog_error(msg)
{
    # Reports an internal inconsistency: writes {msg} to stderr, records
    # status 1 in {abort}, and requests program exit with that status.
    printf("%s\n", msg) > "/dev/stderr";
    abort = 1;
    exit abort;
}
function data_error(msg)
{
    # Reports a problem with the current input record: writes a blank line,
    # then "{FILENAME}:{FNR}: {msg}", then the record itself, all to stderr.
    # Records status 1 in {abort} and requests program exit with that status.
    printf("\n") > "/dev/stderr";
    printf("%s:%d: %s\n", FILENAME, FNR, msg) > "/dev/stderr";
    printf(" %s\n", $0) > "/dev/stderr";
    abort = 1;
    exit abort;
}