#! /usr/bin/gawk -f
# Last edited on 2000-02-02 21:24:42 by stolfi

# Reads a file of word counts, as produced by "uniq -c",
# divided into sections by blank lines and/or #-comments.
#
# Outputs a similar file, in the format
#
#   COUNT SCUMCT SFRAC SCUMFR FCUMCT FFRAC FCUMFR WORD...
#
# where COUNT WORD... is the original record, SCUMCT and FCUMCT are
# the cumulative counts per section and per file, SFRAC and FFRAC
# are the fractions of COUNT relative to the section and file
# totals, and SCUMFR and FCUMFR are the corresponding cumulative
# fractions.
#
# Preserves blank lines and comments.
#
# Exit status: 1 (with a "*** ..." message on stderr) if a data line
# does not have exactly two fields; in that case nothing is printed
# to stdout.

BEGIN {
  # "abort" is negative while everything is fine; error() sets it to
  # the desired exit status.
  abort = -1;

  # Input is buffered in two parallel arrays indexed 0..nr-1:
  #   ct[k] = count of record k, or "#" for a comment/blank line;
  #   w[k]  = rest of record k (or the whole line for "#" records).
  split("", w);
  split("", ct);
  nr = 0;
  OFS = " ";
}

# Slurp input file:

# Stop reading as soon as an error has been flagged.
(abort >= 0) { exit abort; }

# Comment or blank line (optionally preceded by spaces): keep verbatim.
/^ *([#]|$)/ {
  ct[nr] = "#";
  w[nr] = $0;
  nr++;
  next;
}

# Data line: "COUNT WORD".
// {
  # Report the input line number (FNR), not the field count (NF).
  if (NF != 2) { error(("line " FNR ": bad format")); }
  ct[nr] = $1;
  # Blanking $1 rebuilds $0 as OFS followed by the remaining field,
  # so w[] carries its own leading separator for the output format.
  $1 = "";
  w[nr] = $0;
  nr++;
  next;
}

# Print it out:

END {
  # "exit" inside a main rule still runs END; do not emit a partial
  # table after an input error.
  if (abort >= 0) { exit abort; }

  # Compute total totalorum:
  totF = 0;
  for (i = 0; i < nr; i++) {
    if (ct[i] != "#") { totF += ct[i]; }
  }
  # Avoid division by zero when the file has no (or only zero) counts.
  if (totF == 0) { totF = 1; }

  # Loop on sections:
  i = 0;
  cumF = 0;
  while (i < nr) {
    if (ct[i] == "#") {
      # Section separator: copy through unchanged.
      print w[i];
      i++;
    } else {
      # Locate end of section and totalize:
      j = i;
      totS = 0;
      while ((j < nr) && (ct[j] != "#")) {
        totS += ct[j];
        j++;
      }
      # Avoid division by zero for an all-zero section.
      if (totS == 0) { totS = 1; }

      # Now print data with frequencies:
      cumS = 0;
      while (i < j) {
        V = ct[i];
        cumS += V;
        cumF += V;
        printf "%7d %7d %6.4f %6.4f %7d %6.4f %6.4f%s\n",
          V, cumS, (V/totS), (cumS/totS), cumF, (V/totF), (cumF/totF), w[i];
        i++;
      }
    }
  }
}

# Prints MSG to stderr, records a failure status in "abort", and
# terminates input processing with that status.
function error(msg) {
  printf "*** %s\n", msg > "/dev/stderr";
  abort = 1;
  exit abort;
}