#! /usr/bin/gawk -f
# Last edited on 2025-04-30 02:48:50 by stolfi

# Reads from stdin a bunch of pairs of the form {COUNT} {WORD},
# one per line (such as produced by "uniq -c"), where the
# same {WORD} may occur several times.  Adds all {COUNT}s for
# the same {WORD}, and writes the resulting {TOTCOUNT} {WORD} pairs
# to standard output, in some order.
#
# A {COUNT} is an unsigned decimal number, optionally with a decimal
# point.  Everything on the line after the {COUNT} and the blanks that
# follow it is taken as the {WORD}.  Text from a '#' to the end of the
# line is discarded, and lines that are blank after that are ignored.

BEGIN {
  abort = -1;     # Exit status set by {data_error}; negative means "no error so far".
  FS = " ";
  USAGE = "combine-counts < {INFILE} > {OUTFILE}";
  frac_size = 0;  # Number of digits to print after decimal point.
}

# Stop processing input once an error has been flagged:
(abort >= 0) { exit abort; }

{
  # General cleanup: turn TAB, FF and CR into blanks, drop '#'-comments,
  # then trim trailing and leading blanks.  Assigning to {$0} also
  # re-splits the fields, so {$1} below refers to the cleaned line.
  gsub(/[\011\014\015]/, " ", $0);
  gsub(/[#].*$/, "", $0);
  gsub(/[ ]+$/, "", $0);
  gsub(/^[ ]+/, "", $0);

  # Skip blank lines (including lines that held only a '#'-comment):
  if ($0 ~ /^ *$/) { next; }

  # Split the line into count {num} and word {word}.  The pattern demands
  # at least one digit, at most one decimal point, and at least one blank
  # after the number; {RLENGTH} then tells where the word begins.
  if (! match($0, /^[0-9]*([0-9]|[0-9][.]|[.][0-9])[0-9]*[ ]+/)) { data_error(("bad line format")); }
  num = $1;
  word = substr($0, RLENGTH+1);
  sub(/^[ ]+/, "", word);

  # Update the total count for this word:
  ct[word] += num;

  # Update {frac_size} with the number of digits after the decimal point, if any:
  dot = index(num, ".");
  if (dot > 0) {
    ndec = length(num) - dot;
    if (ndec > frac_size) { frac_size = ndec; }
  }
}

END {
  # If {data_error} flagged a failure, just pass on the status and print nothing:
  if (abort >= 0) { exit abort; }

  # Choose the field width to output.
  # {bound} is the least non-negative power of 10 greater than all totals:
  bound = 1;
  for (word in ct) {
    while (bound <= ct[word]) { bound *= 10; }
  }
  int_size = length(bound-1);         # Max number of digits before the point.
  if (int_size < 7) { int_size = 7; } # For compatibility with {uniq -c}.

  # Write the totals, with a fraction part only if some input count had one:
  if (frac_size > 0) {
    width = int_size+1+frac_size;     # Integer digits, the point, and the fraction digits.
    for (word in ct) {
      printf "%*.*f %s\n", width, frac_size, ct[word], word;
    }
  } else {
    for (word in ct) {
      printf "%*d %s\n", int_size, ct[word], word;
    }
  }
}

# Reports the message {msg} on standard error, together with the current
# file name, line number and (cleaned-up) input line, then sets {abort}
# and terminates the script with exit status 1.
function data_error(msg)
{
  printf "%s:%s: ** %s\n", FILENAME, FNR, msg > "/dev/stderr";
  printf " «%s»\n", $0 > "/dev/stderr";
  abort = 1;
  exit 1;
}