#! /usr/bin/gawk -f
# Last edited on 2008-06-15 21:02:36 by stolfi

BEGIN {
  USAGE = ( \
    "format-multi-counts \\\n" \
    " [ -v titles=\"{T[1]} {T[2]} ... {T[n]} {TW}\" ] \\\n" \
    " [ -v totals={BOOL} ] \\\n" \
    " [ -v freqs={BOOL} | -v cumFreqs={BOOL} | -v remFreqs={BOOL} ] \\\n" \
    " [ -v colSep={STRING} ] \\\n" \
    " [ -v width={NUM} ] \\\n" \
    " [ -v widths=\"{W1} {W2} ... {Wn}\" ] \\\n" \
    " [ -v maxLines={NUM} ] \\\n" \
    " < {INFILE} > {OUTFILE}" \
  );

  # The {INFILE} should be a file with records in the format
  # "{F[1]} {F[2]} ... {F[n]} {WORD}" where each {F[i]} is a number
  # (except that "." is accepted as meaning 0).
  #
  # Outputs a fixed-format listing " {F[1]} {F[2]} ... {F[n]} {WORD}"
  # with each field {F[i]} right-justified and {WORD} left-justified.
  #
  # Blank lines and '#'-comments are discarded from the input.
  #
  # If the string "titles" is specified and non-empty, prints its
  # words as column headers, and requires every input record to have
  # as many fields as there are title words. If "titles" is omitted
  # or empty, no header is printed and the number of fields is taken
  # from the first data record.
  #
  # If {widths} is specified, prints each field {F[i]} with {W[i]}
  # columns, right-justified. If {W[i]} is not specified for some
  # column, it defaults to {width}, which in turn defaults to 7. The
  # {WORD} is always printed left-justified.
  #
  # The column separator is the {colSep} parameter (default " ").
  #
  # A line with {WORD="TOTAL"} is printed if {totals} is set.
  # If {freqs} is set, prints the ratio of each {F[i]} to the column
  # total instead of {F[i]}. If {cumFreqs} is set, the ratio is
  # computed from the sum of all {F[i]} in the columns up to this one
  # (inclusive). If {remFreqs} is set, the ratio is computed from the
  # sum of all counts following this one (exclusive).
  # Ratios are printed as integers on a 0..999 scale.
  #
  # For {totals}, {freqs}, {cumFreqs}, and {remFreqs}, the
  # fields {F[i]} had better be numeric. On output,
  # any {F[i]} that is 0 is printed as ".".
  #
  # If {maxLines} is set, tabulates only the first {maxLines} rows
  # of {INFILE}, and replaces any additional lines by a line
  # with {WORD=="..."} and every {F[i]=="."}. However, the
  # totals and frequencies are always computed relative to
  # the whole input file.
  #
  # WARNING: the {WORD} field should not contain any spaces.

  # {abort} is the pending exit status; negative means "no error so far".
  abort = -1;

  # Check the arguments and provide defaults:
  if (width == "") { width = 7; } else { width += 0; }
  if (colSep == "") { colSep = " "; }
  if (maxLines == "") { maxLines = -1; } else { maxLines += 0; }
  if (freqs == "") { freqs = 0; } else { freqs += 0; }
  if (cumFreqs == "") { cumFreqs = 0; } else { cumFreqs += 0; }
  if (remFreqs == "") { remFreqs = 0; } else { remFreqs += 0; }
  if (cumFreqs || remFreqs) { freqs = 1; }
  if (cumFreqs && remFreqs) {
    arg_error(("options {cumFreqs} and {remFreqs} are exclusive"));
  }

  # Parse the {widths} string, set {wd[1..nw]}.
  if (widths !~ /^[0-9 ]*$/) { arg_error(("bad {widths} argument")); }
  nw = split(widths, wd);

  # Parse the {titles} string, set {nh} and {hd[1..nh]}.
  # An empty {titles} is legal and gives {nh == 0} (no header).
  nh = split(titles, hd);

  # Expand {wd[1..nh-1]} to account for title widths.
  # First make sure {wd[i]} exists (padding with the default {width}),
  # then widen it if the title is longer.
  for (i = 1; i <= nh-1; i++) {
    while (nw < i) { nw++; wd[nw] = width; }
    wdi = length(hd[i]);
    if (wdi > wd[i]) { wd[i] = wdi; }
  }

  split("", tot);  # {tot[i]} is the sum of column {i} over the whole input.
  split("", ct);   # {ct[k,i]} is field {i} of data line {k}, as read.
  split("", key);  # {key[k]} is the {WORD} of data line {k}.
  nl = 0;   # Number of lines in input file.
  nf = nh;  # Number of fields in output (including the {WORD}); 0 = not yet known.
}

# Bail out if argument parsing failed.
(abort >= 0) { exit abort; }

# Normalize every input line: map TAB/FF/CR to blanks, strip
# '#'-comments, and trim leading and trailing blanks. This must run
# on all lines (not only those containing a blank) so that lines like
# "#comment" are reduced to empty and skipped by the next rule.
{
  gsub(/[\011\014\015]/, " ", $0);
  gsub(/[#].*$/, "", $0);
  gsub(/^[ ]+/, "", $0);
  gsub(/[ ]+$/, "", $0);
}

# Process one non-empty data record.
/./ {
  if (nf > 0) {
    if (NF != nf) {
      data_error(("wrong num of fields nf = " nf " NF = " NF "\n" $0));
    }
  } else {
    # No titles were given: the first record fixes the field count.
    nf = NF;
  }
  nl++;
  # Rows beyond {maxLines} count towards totals but are not stored:
  keep = ((maxLines < 0) || (nl <= maxLines));
  for (i = 1; i <= nf-1; i++) {
    fi = $(i);
    tot[i] += (fi == "." ? 0 : fi);
    if (keep) {
      # Save the field:
      ct[nl,i] = fi;
      # Expand {wd[i]} to accommodate the field:
      while (nw < i) { nw++; wd[nw] = width; }
      wdi = length(fi);
      if (wdi > wd[i]) { wd[i] = wdi; }
    }
  }
  key[nl] = $(nf);
  next;
}

END {
  if (abort >= 0) { exit abort; }

  if (freqs) {
    # Set {den[i]} to the denominator of freqs in column {i}, and, for
    # {cumFreqs|remFreqs}, {num[i]} to the initial running numerator
    # (0 when accumulating, the column total when counting down).
    split("", den);
    split("", num);
    for (i = 1; i < nf; i++) {
      den[i] = tot[i];
      if (cumFreqs || remFreqs) { num[i] = (remFreqs ? den[i] : 0); }
    }
  }

  if (totals) {
    # Adjust column widths for totals:
    for (i = 1; i < nf; i++) {
      wdi = length((freqs ? "999" : tot[i]));
      if (wdi > wd[i]) { wd[i] = wdi; }
    }
  }

  if (nh > 0) {
    # Print titles and dashes:
    for (i = 1; i <= nf-1; i++) { pr(i, (i < nh ? hd[i] : "?")); }
    printf "%s\n", hd[nh];
    for (i = 1; i <= nf-1; i++) { pd(i); }
    wdw = length(hd[nh]);
    for (j = 0; j < wdw; j++) { printf "-"; }
    printf "\n";
  }

  # Decide the number of lines to print: all stored rows, plus one
  # "..." row when the input was truncated at {maxLines}.
  if (maxLines < 0) { maxLines = nl+1; }
  ml = (nl > maxLines ? maxLines+1 : nl);

  for (k = 1; k <= ml; k++) {
    # Print line {k}:
    for (i = 1; i < nf; i++) {
      ctF = ct[k,i];                        # Formatted count
      ctN = (ctF == "." ? 0 : ctF + 0);     # Numeric count
      if (freqs) {
        if (cumFreqs) {
          num[i] += ctN;
        } else if (remFreqs) {
          num[i] -= ctN;
        } else {
          num[i] = ctN;
        }
        # A zero denominator (possible with mixed-sign inputs) would be
        # a fatal division by zero; print "." for such cells instead.
        if ((num[i] == 0) || (den[i] == 0)) {
          ctF = ".";
        } else {
          ctF = int(num[i]*999.999/den[i]);
        }
      } else if (ctN == 0) {
        ctF = ".";
      }
      if (k <= maxLines) {
        pr(i, ctF);
      } else if (k == maxLines+1) {
        pr(i, ".");
      }
    }
    if (k <= maxLines) {
      printf "%s\n", key[k];
    } else if (k == maxLines+1) {
      printf "%s\n", "...";
    }
  }

  if (totals) {
    for (i = 1; i < nf; i++) {
      if (freqs) { pr(i, "999"); } else { pr(i, tot[i]); }
    }
    printf "%s\n", "TOTAL";
  }
}

function pr(i, str,   w) {
  # Prints string "str" on column "i", right-justified,
  # followed by the column separator. Columns beyond {nw}
  # use the default {width}.
  w = (i > nw ? width : wd[i]);
  printf "%*s%s", w, str, colSep;
}

function pd(i,   w,j) {
  # Prints a bunch of dashes on column "i"
  # followed by the column separator.
  w = (i > nw ? width : wd[i]);
  for (j = 0; j < w; j++) { printf "-"; }
  printf "%s", colSep;
}

function arg_error(msg) {
  # Reports a bad command-line argument on stderr, prints the usage
  # text, and exits with status 1 (recorded in {abort} so that the
  # END rule does not produce output).
  printf "format-multi-counts: **%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", USAGE > "/dev/stderr";
  abort = 1;
  exit 1;
}

function data_error(msg) {
  # Reports a bad input record on stderr, with file name and line
  # number, and exits with status 1 (recorded in {abort}).
  printf "%s:%s: ** %s\n", FILENAME, FNR, msg > "/dev/stderr";
  printf " «%s»\n", $0 > "/dev/stderr";
  abort = 1;
  exit 1;
}