#! /usr/bin/gawk -f
# Last edited on 2025-04-29 20:43:46 by stolfi

# Reports a fatal error: writes {msg} followed by a newline to
# standard error, records the failure in the global {abort}, and
# terminates the program with exit status 1.
function error(msg)
{
   abort = 1;
   printf("%s\n", msg) > "/dev/stderr";
   exit 1;
}

# Reports a fatal error in the command-line arguments or in the
# dictionary file.  Behaves exactly like {error}: {msg} plus a newline
# goes to standard error, {abort} becomes 1, and the exit status is 1.
function arg_error(msg)
{
   error(msg);
}

BEGIN {
  # {abort} stays negative until {error} or {arg_error} is called;
  # the END rule tests it to suppress the normal output after a failure.
  abort = -1;
  printf "**" > "/dev/stderr";
  usage = "$0 -v dic=DICFILE < COUNTFILE > PROBFILE";
  # No file operands are accepted: ARGC counts only the program name.
  # (Awk forbids whitespace between a user function name and its "(".)
  if (ARGC != 1) { arg_error(("usage: " usage "\n")); }

  # Reads a list of words DICFILE, in some specific order, and a file
  # of word counts, as produced by "uniq -c".  Outputs another file
  # PROBFILE whose lines have the form PROB WORD, where WORD
  # enumerates all the words of DICFILE, in their proper order, and
  # PROB is an estimate of the probability of WORD in the infinite
  # text.

  if (dic == "") { arg_error(("usage: " usage "\n")); }

  # {wd[0..sizeD-1]} holds the dictionary words in file order;
  # {ct[w]} will accumulate the count of each dictionary word {w}.
  split("", wd);
  split("", ct);
  sizeD = 0;
  # Clear ERRNO so that a failed read in the loop below is detectable.
  ERRNO = ""
  while ((getline < dic) > 0)
    { wd[sizeD] = $1;
      sizeD ++;
      # {sizeD} is now the 1-based number of the line just read.
      if (NF != 1) { arg_error((dic ", line " sizeD ": bad format")); }
      ct[$1] = 0;
    }
  if (ERRNO != "") { arg_error((dic ": " ERRNO)); }
  printf "size(D) = %6d ", sizeD > "/dev/stderr";
  close(dic);
  # Sum of the input counts that belong to dictionary words.
  sizeSD = 0;
}

# Safety guard: once a failure has been recorded in {abort}, stop
# processing input and exit with that status.
abort >= 0 { exit abort }

# Input lines that begin with "#" are comments; ignore them.
$0 ~ /^#/ { next }

# Every other non-empty input line must have the form "COUNT WORD".
# Counts of words present in the dictionary are added to {ct[WORD]}
# and to the grand total {sizeSD}; other words are silently ignored.
/./ { 
  if (NF != 2)
    { # {counts} is not assigned anywhere in this script; honor it if
      # the caller supplied it with "-v counts=NAME", otherwise name
      # the standard input so the message does not begin with ", line".
      src = (counts != "" ? counts : "stdin");
      error((src ", line " NR ": bad format"));
    }
  c = $1; w = $2; 
  if (w in ct) { ct[w] += c; sizeSD += c; }
  next;
}

END {
  # After a reported error, just propagate the exit status.
  if (abort >= 0) { exit abort; }
  printf "size(S.D) = %7d\n", sizeSD > "/dev/stderr";
  # Add-one estimate: each dictionary word gets one extra occurrence,
  # so the denominator is the observed total plus the dictionary size.
  total = sizeSD + sizeD;
  k = 0;
  while (k < sizeD)
    { word = wd[k];
      printf "%7.5f %s\n", (ct[word] + 1)/total, word;
      k++;
    }
}