#! /usr/bin/gawk -f
# Last edited on 2026-03-03 14:30:28 by stolfi

BEGIN {
  abort = -1;
  usage = ( "compute-freqs \\\n" \
    " [ -v outputTotal=STRING ] \\\n" \
    " [ -v countField=NUM ] [ -v freqField=NUM ] \\\n" \
    " < INFILE.wct > OUTFILE.wfr" \
  );

  # Reads a file whose lines contain a {COUNT} field.
  # Outputs a similar file with an additional {FREQ} field which is
  # the fraction of {COUNT} relative to the total of all {COUNT}s.
  # Blank lines and lines whose first non-blank character is '#'
  # are copied to the output unchanged.
  #
  # The {COUNT} may be fractional.
  #
  # The {countField} is the index (from 1) of the count field in the
  # input records.  If not specified, it defaults to 1.
  #
  # The {freqField} is the index (from 1) where the frequency
  # should be inserted.  It should be greater than {countField}.
  # If not specified, it defaults to {countField+1}.
  #
  # If neither {countField} nor {freqField} are specified,
  # the program expects a file whose lines are "{COUNT} {ITEM}..",
  # as produced by "uniq -c".
  #
  # If {outputTotal} is not empty, the program outputs an extra line
  # with all fields equal to '-', except that
  # the field {countField} is the sum of all input {COUNT}s,
  # the field {freqField} is 1.0, and field {freqField+1} is
  # the string {outputTotal}.
  #
  # All input lines are buffered in memory, because the total is
  # needed before the first frequency can be printed.

  if (countField == "") { countField = 1; } else { countField += 0 }
  if (freqField == "") { freqField = countField + 1; } else { freqField += 0 }
  if (countField < 1) { arg_error(("bad {countField} = " countField "")) }
  if (freqField < 1) { arg_error(("bad {freqField} = " freqField "")) }

  total = 0    # Sum of all {COUNT}s seen so far.
  nread = 0    # Number of input lines stored in {ct,lins}.
  nfmax = 1    # Max {NF} among the data lines.
  precmax = 0  # Max nonzero frac digits in {COUNT}
  split("", lins)
  split("", ct)
}

(abort >= 0) { exit abort; }

# Comment or blank line: remember it verbatim, marked with "#" in {ct}.
/^[ ]*([#]|$)/ {
  ct[nread] = "#";
  lins[nread] = $0;
  nread++;
  next;
}

# Data line: validate, accumulate the total, and save for the {END} pass.
// {
  if (NF < freqField-1) { data_error(("bad NF = " NF "")); }
  if (NF > nfmax) { nfmax = NF }
  count = $(countField)
  if (! match(count, /^[0-9]*[0-9.][0-9]*$/)) {
    data_error(("invalid count = «" count "»"))
  }
  total += count;
  ct[nread] = count;
  lins[nread] = $0;
  prec = frac_digits(count)
  if (prec > precmax) { precmax = prec }
  nread++;
  next;
}

END {
  if (abort >= 0) { exit abort; }
  debug = 0

  # Determine count and frequency formats.  Zero values are shown as
  # a lone "." placed where the decimal point would be, padded with
  # blanks to the full field width:
  idigs = int_digits(total)
  if (precmax > 0) {
    ctsize = idigs + 1 + precmax
    ctfmt = ( "%" ctsize "." precmax "f" )
    ctzero = sprintf(("%" ctsize "s"), sprintf(("%-" (precmax + 1) "s"), "."))
  } else {
    ctsize = idigs
    ctfmt = ( "%" ctsize "d" )
    ctzero = sprintf(ctfmt, 0)
  }
  if (debug) {
    printf "!! ctfmt = '%s' ctzero = '%s'\n", ctfmt, ctzero > "/dev/stderr"
  }

  frprec = 5            # Fraction digits of {FREQ}.  Should do better...
  frsize = frprec + 2   # Leading digit, decimal point, fraction digits.
  frfmt = ( "%" frsize "." frprec "f" )
  # Same width as {frfmt} output, so that columns stay aligned:
  frzero = sprintf(("%" frsize "s"), sprintf(("%-" (frprec + 1) "s"), "."))
  if (debug) {
    printf "!! frfmt = '%s' frzero = '%s'\n", frfmt, frzero > "/dev/stderr"
  }

  for (i = 0; i < nread; i++) {
    if (ct[i] == "#") {
      print lins[i];
    } else {
      # if (debug) { printf "!! lins[%d] = '%s'\n", i, lins[i] > "/dev/stderr" }
      nf = split(lins[i], fld)
      # if (debug) { printf "!! nf = %d fld[nf] = '%s'", nf, fld[nf] > "/dev/stderr" }
      output_line(nf,fld, ct[i], ctfmt, ctzero, total, frfmt, frzero)
    }
  }

  if (outputTotal != "") {
    # Index of the label in the field list BEFORE the {FREQ} insertion.
    # Input field {freqField} is printed as output field {freqField+1},
    # which is where the label is documented to go.  When {freqField}
    # does not exceed {countField}, the label goes right after the count.
    labField = (freqField > countField ? freqField : countField + 1)
    split("", fld)
    if (nfmax < labField) { nfmax = labField }
    for (j = 1; j <= nfmax; j++) { fld[j] = "-" }
    fld[labField] = outputTotal
    output_line(nfmax,fld, total, ctfmt, ctzero, total, frfmt, frzero)
  }
}

function output_line(nf,fld,cti,ctfmt,ctzero,total,frfmt,frzero,   ctx,frx,i,fldi) {
  # Prints one output line with {nf+1} fields separated by single blanks:
  # the fields {fld[1..nf]}, with {fld[countField]} replaced by the
  # formatted count {cti}, and with the formatted frequency {cti/total}
  # inserted as field number {freqField}.  A zero count is printed as
  # {ctzero} and its frequency as {frzero}.  Modifies {fld[countField]}.
  if (cti+0 == 0) {
    ctx = ctzero; frx = frzero
  } else {
    ctx = sprintf(ctfmt, cti)
    frx = sprintf(frfmt, (total != 0 ? cti/total : 0))
  }
  fld[countField] = ctx
  for (i = 1; i <= nf+1; i++) {
    if (i == freqField) {
      fldi = frx
    } else if (i < freqField) {
      fldi = fld[i]
    } else {
      # Fields after the inserted frequency are shifted by one.
      fldi = fld[i-1]
    }
    printf "%s%s", (i == 1 ? "" : " "), fldi
  }
  printf "\n"
}

function frac_digits(ct) {
  # Number of nonzero fraction digits in {ct}: the digits after the
  # decimal point, not counting trailing zeros.  Returns 0 if {ct}
  # has no decimal point.
  if (match(ct, /[.]/)) {
    gsub(/^.*[.]/, "", ct)
    gsub(/[0]+$/, "", ct)
    return length(ct)
  } else {
    return 0
  }
}

function int_digits(ct,   s) {
  # Number of integer digits in the non-negative value {ct}, which may be
  # a number or a numeric string.  Returns 1 if the integer part is zero.
  # The digits are taken from the truncated numeric value rather than
  # from the default string conversion, because the latter depends on
  # {CONVFMT} and switches to exponent notation for large non-integers.
  s = sprintf("%.0f", int(ct + 0))
  return length(s)
}

function arg_error(msg) {
  # Reports a bad command line argument, shows the usage, and aborts.
  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit abort;
}

function data_error(msg) {
  # Reports an error in the current input record and aborts.
  # Uses {FNR}, the 1-based number of the record being processed.
  printf "stdin:%d: ** %s\n", FNR, msg > "/dev/stderr";
  printf " «%s»\n", $0 > "/dev/stderr";
  abort = 1;
  exit abort;
}