#! /usr/bin/gawk -f 
# Last edited on 1999-07-28 01:42:43 by stolfi

BEGIN{
  ok = 1;
  usage = "compute-lang-index -v coeffs=CFILE < SFILE";
  #
  # where each CFILE line has a signed coefficient and a
  # word from the reference dictionary, and SFILE contains
  # the sample text, one word per line.
  #
  # After reading both files, prints to stdout three numbers
  #
  #   count = number of words in SFILE that are listed
  #           in CFILE;
  #
  #   index = sum, over all words listed in CFILE, of the word's
  #           coefficient times its smoothed relative frequency
  #           (occurrences in SFILE + 1)/(count + lines in CFILE),
  #           divided by the square root of the sum of the
  #           squared coefficients;
  #
  #   error = nominal uncertainty of the "index" value.
  #

  if (coeffs == "") { error(("usage: " usage)); }

  # c[w] is the coefficient of word w; count[w] is the number of
  # times w has been seen in the sample text.  The split() calls
  # just make sure both names are (empty) arrays.
  split("", c);
  split("", count);
  sizeD = 0;   # Number of lines read from the coefficient file.
  sumC2 = 0;   # Sum of the squared coefficients.

  # Keep the result of getline so that a read failure (-1) can be
  # told apart from a normal end of file (0) after the loop.
  while ((status = (getline < coeffs)) > 0)
    { sizeD++;
      if (NF != 2) { error((coeffs ", line " sizeD ": bad format")); }
      cf = $1;
      w = $2;
      c[w] = cf;
      sumC2 += (cf * cf);
      count[w] = 0;
    }
  # Test the getline status rather than comparing ERRNO with "0":
  # ERRNO holds a message string, so an empty ERRNO would also
  # compare unequal to "0" and be reported as a failure.
  if (status < 0) { error((coeffs ": " ERRNO)); }
  close(coeffs);
  sizeSD = 0;  # Number of sample words that are listed in CFILE.
}

/./{
  # Handles one non-empty line of the sample text, which must
  # consist of exactly one word.  Words found in the coefficient
  # table "c" are tallied in "count" and in the total "sizeSD";
  # all other words are ignored.
  if (! ok) { exit(1); }
  if (NF != 1) { error(("line " NR ": bad word")); }
  w = $1;
  if (! (w in c)) { next; }
  count[w]++;
  sizeSD++;
  next;
}

END{
  # Computes and prints the three result fields:
  #   sizeSD  = number of sample words that are listed in CFILE;
  #   index   = sum of c[w]*fr[w] over all listed words w, divided
  #             by sqrt(sumC2), where fr[w] is the frequency of w
  #             in the sample, computed with one extra occurrence
  #             per word;
  #   error   = sqrt((1/6)/(sizeSD + sizeD)).
  if (! ok) { exit(1); }

  # An empty coefficient file, or one whose coefficients are all
  # zero, leaves sumC2 at zero and would make the divisions below
  # fail with a fatal "division by zero".  Report it explicitly.
  if (sumC2 == 0)
    { error((coeffs ": no nonzero coefficients, cannot compute index")); }

  sumCF = 0;
  for (w in c)
    { # The "+1" gives every listed word one extra occurrence, so
      # words absent from the sample still contribute to the sum.
      fr = (count[w]+1)/(sizeSD + sizeD);
      sumCF += c[w]*fr;
    }
  scale = sqrt(sumC2);
  printf "%7d %+8.5f %7.5f\n", sizeSD, sumCF/scale, sqrt((1.0/6.0)/(sizeSD + sizeD));
}

function error(msg)
{
   # Writes "msg" and a newline to the standard error stream, clears
   # the global "ok" flag, and exits with status 1.  The rules that
   # test "ok" (the main rule and END) then exit without doing
   # any further work.
   printf("%s\n", msg) > "/dev/stderr";
   ok = 0;
   exit(1);
}