#! /usr/bin/gawk -f
# Last edited on 2001-01-05 23:29:30 by stolfi

BEGIN {
  abort = -1;
  usage = ( "compute-row-col-freqs \\\n" \
    " [ -v outputTotals=BOOL ] \\\n" \
    " < INFILE.pct > OUTFILE.pfr" \
  );

  # Reads a file with entries of the form COUNT ITEM1:ITEM2,
  # as produced by "uniq -c".
  #
  # Outputs a similar file with lines of the form
  #   COUNT FREQROW FREQCOL FREQTOT ITEM1:ITEM2
  # where FREQROW is the fraction of COUNT relative to the total of
  # all COUNTs with the same ITEM1, FREQCOL is the same relative to
  # ITEM2, and FREQTOT is the frequency relative to all items.
  #
  # Blank lines and lines whose first non-blank character is "#" are
  # copied to the output unchanged, in their original position.
  #
  # If outputTotals is 1, also outputs extra pairs of the form ITEM1:+,
  # +:ITEM2, +:+ with the row, column, and table totals, respectively.

  if (outputTotals == "") { outputTotals = 0; }

  # Totals per ITEM1 (row), per ITEM2 (column), and for the whole table.
  split("", totrow);
  split("", totcol);
  tottbl = 0;

  # Input entries in reading order.  For a comment/blank line,
  # count[i] is the sentinel "#" and pair[i] holds the raw line.
  # For a data line, rowkey[i] and colkey[i] hold ITEM1 and ITEM2.
  split("", count);
  split("", pair);
  split("", rowkey);
  split("", colkey);
  n = 0;
}

# After a fatal error, stop processing input right away.
(abort >= 0) { exit abort; }

# Comment or blank line: remember it verbatim.
/^[ ]*([#]|$)/ {
  count[n] = "#";
  pair[n] = $0;
  n++;
  next;
}

# Data line: "COUNT ITEM1:ITEM2".
// {
  if (NF != 2) {
    # Report the input line number (FNR), not the field count.
    fatal_error(("line " FNR ": bad input format = «" $0 "»"));
  }
  ct = $1;
  pr = $2;
  nitems = split(pr, item, ":");
  if (nitems != 2) {
    fatal_error(("line " FNR ": bad pair format = «" pr "»"));
  }

  # Store the entry together with its already-validated keys, so the
  # END rule does not have to split the pair a second time.
  count[n] = ct;
  pair[n] = pr;
  rowkey[n] = item[1];
  colkey[n] = item[2];
  n++;

  totrow[item[1]] += ct;
  totcol[item[2]] += ct;
  tottbl += ct;
  next;
}

END {
  # "exit" in a main rule still runs END; propagate the error status.
  if (abort >= 0) { exit abort; }

  dentbl = nonzero(tottbl);

  # Second pass over the stored entries, now that all totals are known.
  for (i = 0; i < n; i++) {
    if (count[i] == "#") {
      print pair[i];
    } else {
      print_entry(count[i], nonzero(totrow[rowkey[i]]), nonzero(totcol[colkey[i]]), dentbl, pair[i]);
    }
  }

  if (outputTotals) {
    # Row totals: each is 100% of its own row; column fraction is
    # taken relative to the whole table.
    for (it in totrow) {
      print_entry(totrow[it], nonzero(totrow[it]), dentbl, dentbl, (it ":+"));
    }
    # Column totals: symmetric to the row totals.
    for (it in totcol) {
      print_entry(totcol[it], dentbl, nonzero(totcol[it]), dentbl, ("+:" it));
    }
    # Grand total.
    print_entry(tottbl, dentbl, dentbl, dentbl, "+:+");
  }
}

function nonzero(x) {
  # Returns {x}, or 1 if {x} is zero, so it can safely be used as a divisor.
  return (x == 0 ? 1 : x);
}

function print_entry(ct, denrow, dencol, dentbl, pr) {
  # Writes one output line: the count {ct}, its fractions relative to
  # the row, column, and table denominators, and the pair label {pr}.
  printf "%7d %7.5f %7.5f %7.5f %s\n", ct, ct/denrow, ct/dencol, ct/dentbl, pr;
}

function fatal_error(msg) {
  # Prints {msg} to standard error, records the failure in the global
  # {abort}, and terminates with exit status 1.
  printf "%s\n", msg > "/dev/stderr";
  abort = 1;
  exit abort;
}