#! /usr/bin/gawk -f
# Last edited on 2025-04-30 02:48:50 by stolfi

# Reads from stdin a bunch of pairs of the form {COUNT} {WORD},
# one per line (such as produced by "uniq -c"), where the 
# same {WORD} may occur several times.  Adds all {COUNT}s for
# the same {WORD}, and writes the resulting {TOTCOUNT} {WORD} pairs 
# to standard output, in some order.

BEGIN {
  # One-line synopsis of how the script is meant to be invoked.
  USAGE = "combine-counts < {INFILE} > {OUTFILE}";

  # A single blank as {FS} selects awk's default whitespace field splitting.
  FS = " ";

  # Exit status requested by an error handler; negative means "none so far".
  abort = -1;

  # Number of digits to print after the decimal point in the output totals.
  frac_size = 0;
}

(0 <= abort) { exit abort }

# Normalize every record before it is parsed: tabs, form feeds and
# carriage returns become blanks, a '#' and everything after it is
# dropped, and blanks at both ends of the line are trimmed.
{
  gsub(/[\011\014\015]/, " ");
  sub(/[#].*$/, "");
  sub(/^[ ]+/, "");
  sub(/[ ]+$/, "");
}

# Nothing left after cleanup (blank or '#'-comment-only line): ignore it.
$0 ~ /^ *$/ { next }

{
  # Split the line into the count {count} and the word {word}.  The line
  # must start with an unsigned decimal number (with at least one digit
  # and an optional decimal point) followed by one or more blanks:
  if (match($0, /^[0-9]*([0-9]|[0-9][.]|[.][0-9])[0-9]*[ ]+/) == 0)
    { data_error("bad line format"); }
  count = $1;
  word = substr($0, RLENGTH + 1);
  sub(/^[ ]+/, "", word);

  # Remember the largest number of digits seen after a decimal point:
  if (match(count, /[.]/) > 0)
    { ndigits = length(count) - RSTART;
      frac_size = (ndigits > frac_size ? ndigits : frac_size);
    }

  # Add this count to the running total for the word:
  ct[word] += count;
}

END {
  # If a fatal input error was recorded, just propagate its status and
  # print nothing.
  if (abort >= 0) { exit abort; }

  # Choose the field width to output.  The digits before the decimal
  # point are counted while {pmax} grows, instead of being measured as
  # {length(pmax-1)}: once {pmax} reaches 1e16 the value {pmax-1} is no
  # longer exactly representable as a double and rounds back up to
  # {pmax}, which made the field one column wider than needed.
  pmax = 1;      # Least non-negative power of 10 greater than all counts.
  int_size = 0;  # Number of decimal digits in {pmax-1}.
  for (w in ct)
    { while (pmax <= ct[w]) { pmax *= 10; int_size++; } }
  if (int_size < 7) { int_size = 7; } # For compatibility with {uniq -c}.

  # Write one {TOTCOUNT} {WORD} line per distinct word, in whatever order
  # {for (w in ct)} yields.  Totals are printed with {frac_size} decimals
  # when some input count had a fraction part, and as integers otherwise.
  for (w in ct)
    { if (frac_size > 0)
        { printf "%*.*f %s\n", int_size+1+frac_size, frac_size, ct[w], w; }
      else
        { printf "%*d %s\n", int_size, ct[w], w; }
    }
}

function data_error(msg)
{
  # Reports a fatal problem with the current input record and quits.
  # Writes "{FILENAME}:{FNR}: ** {msg}" to standard error, followed by
  # the current contents of {$0} between «» quotes (the quote characters
  # had been corrupted by a character-set conversion and are restored
  # here).  Sets the global {abort} to 1 before calling {exit}, so that
  # code which runs after {exit} can see that an error occurred.
  printf "%s:%s: ** %s\n", FILENAME, FNR, msg > "/dev/stderr";
  printf "  «%s»\n", $0 > "/dev/stderr";
  abort = 1; exit 1;
}