#! /usr/bin/gawk -f
# Last edited on 2012-02-14 01:49:46 by stolfilocal

# Reads a table whose lines have the form
#
#     {COUNT} {OLD_SPELLING} {SEP} {NEW_SPELLING}
#
# where {SEP} is one of the characters "=", "|", "<", ">", "(", ")".
# Lines whose {SEP} is "<" or ">" are accepted but ignored.  Every other
# line credits {COUNT} to {OLD_SPELLING}; each line is treated as a
# separate pair (repeated pairs are not merged).
#
# For each {OLD_SPELLING}, in the order returned by asorti(), the
# {NEW_SPELLING} of the line with the largest {COUNT} is selected (the
# first line read wins ties) and
#
#     {FRACTION} = largest count / total count of {OLD_SPELLING}
#
# is computed.  Then:
#
#   * if {FRACTION} >= 0.800 and the two spellings differ, the line
#     "{OLD_SPELLING} {NEW_SPELLING} # {FRACTION}" goes to standard output
#     (nothing is written when the spellings are equal);
#   * if {FRACTION} < 0.800, an "ambiguous: ..." line goes to standard
#     error and nothing is written to standard output.
#
# Any input line that does not fit the format above stops the run with
# exit status 1 after a diagnostic on standard error.

BEGIN {
  # Negative means "no error so far"; the error functions set it to the
  # exit status so that the END rule can tell an abort from a normal end.
  abort = -1;

  # split("", arr) is used to create (or clear) each table as an empty array.
  split("", o_npairs);  # Indexed by {oldsp}, gives number of new spellings for {oldsp}.
  split("", o_newsp);   # Indexed by {oldsp}, gives new spelling of {oldsp} with largest count.
  split("", o_maxct);   # Indexed by {oldsp}, gives count of that {oldsp,newsp} pair.
  split("", o_totct);   # Indexed by {oldsp}, gives total count of {oldsp}.
}

# Guard: once an error has been flagged, process no further records.
(abort >= 0) {
  exit abort;
}

# Data line: "{COUNT} {OLD_SPELLING} {SEP} {NEW_SPELLING}".
/^[ ]*[0-9]+[ ].*[ ][=|<>()][ ].*$/ {
  if (NF != 4) {
    data_error(("bad format"));
  }

  # "+ 0" makes the count a number explicitly, so the comparison and the
  # sums below never depend on how the field happens to be typed.
  count = $1 + 0;
  oldsp = $2;
  opcmp = $3;
  newsp = $4;

  # Lines with separator "<" or ">" take no part in the statistics.
  if ((opcmp != "<") && (opcmp != ">")) {
    if (! (oldsp in o_npairs)) {
      o_npairs[oldsp] = 0;
      o_maxct[oldsp] = 0;
      o_totct[oldsp] = 0;
    }
    # Strict ">" keeps the first line seen among equal counts.
    if (count > o_maxct[oldsp]) {
      o_newsp[oldsp] = newsp;
      o_maxct[oldsp] = count;
    }
    o_totct[oldsp] += count;
    o_npairs[oldsp]++;
  }
  next;
}

# Anything that did not match the data-line pattern is rejected.
// {
  data_error(("bad format"));
}

END {
  # An "exit" in a rule or function still runs END; propagate the status
  # without producing any output.
  if (abort >= 0) {
    exit abort;
  }

  # k_oldsp[1..nold] receives the indices of o_totct, ordered by asorti().
  nold = asorti(o_totct, k_oldsp);
  for (k = 1; k <= nold; k++) {
    oldsp = k_oldsp[k];
    np = o_npairs[oldsp];
    totct = o_totct[oldsp];
    newsp = o_newsp[oldsp];
    maxct = o_maxct[oldsp];

    # Sanity checks.  The second one also protects the division below;
    # it fires when every count recorded for {oldsp} was zero.
    if (np <= 0) {
      prog_error(("duh?"));
    }
    if (totct <= 0) {
      prog_error(("duh?"));
    }

    fract = maxct/totct;
    if (fract >= 0.800) {
      # Identity mappings are not worth writing out.
      if (oldsp != newsp) {
        printf "%-30s %-30s # %8.6f\n", oldsp, newsp, fract;
      }
    } else {
      # Parenthesized argument list: the ">" that follows is the redirection.
      printf("ambiguous: %-20s %-20s # %8.6f\n", oldsp, newsp, fract) > "/dev/stderr";
    }
  }
}

# Reports an internal inconsistency: writes {msg} to standard error,
# flags the abort and exits with status 1.
function prog_error(msg) {
  printf("%s\n", msg) > "/dev/stderr";
  abort = 1;
  exit 1;
}

# Reports a bad input record: writes "{FILENAME}:{FNR}: {msg}" followed by
# the offending record to standard error, flags the abort and exits with
# status 1.
function data_error(msg) {
  printf("\n") > "/dev/stderr";
  printf("%s:%d: %s\n", FILENAME, FNR, msg) > "/dev/stderr";
  printf(" %s\n", $0) > "/dev/stderr";
  abort = 1;
  exit 1;
}