#! /usr/bin/gawk -f
# Last edited on 2000-02-03 12:02:38 by stolfi

# Reads from stdin a bunch of pairs of the form PROB WORD,
# one per line (such as produced by "uniq -c"), where the 
# same WORD may occur several times.  Adds all PROBs for
# the same WORD, and writes the resulting TOTPROB WORD pairs 
# to standard output, in some order.

BEGIN {
  # abort < 0 means "no error so far"; error() stores the exit status here.
  abort = -1;
  usage = "combine-probs < INFILE > OUTFILE";

  # Start with an empty table of per-word totals (split of "" clears pr).
  split("", pr);

  # Input comes from stdin only, so no file operands are accepted.
  if (ARGC != 1) {
    error(("usage: " usage));
  }
}

# Guard rule: once an error status has been recorded in abort, stop
# processing input and exit with that status.
# NOTE(review): error() already calls exit itself, so this looks like a
# safety net rather than the primary exit path -- confirm before removing.
abort >= 0 {
  exit abort;
}

# Main rule: for every non-empty input line, parse "PROB WORD" and add
# PROB to the running total kept for WORD in pr[].
#
# PROB must contain at least one digit.  It may be a plain integer (the
# form written by "uniq -c") or a decimal such as "0.25", ".25" or "3.".
# Leading blanks before PROB are allowed.  WORD is everything that follows
# the blanks after PROB, taken verbatim, so it may itself contain blanks.
# Any line that does not fit this shape is reported through error(),
# which terminates the run with a non-zero status.
/./ { 
  if (! match($0, /^[ \t]*([0-9]+([.][0-9]*)?|[.][0-9]+)[ \t]+/)) 
    { error(("line " NR ": bad format \"" $0 "\"")); }
  # This script never changes FS, so with default field splitting $1 is
  # exactly the PROB token that the pattern above just validated.
  p = $1;
  # The pattern is anchored at the start of the line, so RLENGTH is the
  # length of the "blanks PROB blanks" prefix and WORD begins right after.
  w = substr($0, RLENGTH+1);
  pr[w] += p;
}

END {
  # If an error was recorded, exit with its status and print nothing.
  if (abort >= 0) {
    exit abort;
  }

  # Emit one "TOTPROB WORD" line per distinct word; the traversal order of
  # "for (w in pr)" is whatever awk chooses.
  for (w in pr) {
    # Skip totals below the cutoff: at five decimals they would show up
    # as (approximately) 0.00000 anyway.
    if (pr[w] < 0.000005) {
      continue;
    }
    printf "%7.5f %s\n", pr[w], w;
  }
}

# error(msg): report a fatal problem and stop the program.
#   msg - text to print, without a trailing newline.
# Writes msg plus a newline to /dev/stderr, records exit status 1 in the
# global abort (which the END rule checks), and exits with that status.
function error(msg)
{
  printf "%s\n", msg > "/dev/stderr";
  abort = 1;
  exit abort;
}