#! /usr/bin/gawk -f
# Last edited on 1998-07-15 23:22:48 by stolfi

BEGIN {
  usage = ( "compute-cond-tuple-info \\\n" \
            "  -v order=ORDER \\\n" \
            "  -v place=PLACE \\\n" \
            "  [ -v alphabetSize=NUM ] \\\n" \
            "  < COUNTFILE > INFOTBL" \
          );

  # The file COUNTFILE must have entries COUNT WORD where COUNT
  # is a positive occurrence count for the string WORD in some text.
  # The length of all WORDs must be ORDER.
  #
  # Computes the conditional information content for the PLACEth
  # character of each WORD in COUNTFILE, assuming the rest of WORD is
  # known.  Outputs entries INFO WORD for each input WORD.
  #
  # The optional "alphabetSize" value is used as the number of
  # possible categories in probability estimation.  If given, the
  # input should not have more than that number of distinct letters in
  # the PLACEth position of all tuples.  If not given, the actual
  # number of distinct letters in that position is used instead.
  #
  # Also prints to "stderr" a summary of the data.

  abort = -1;
  if (order == "") { error("should define \"order\""); } 
  if ((order < 1) || (order > 20)) { error("funny \"order\""); } 
  if (place == "") { error("should define \"place\""); } 
  if ((place < 1) || (place > order)) { error("funny \"place\""); } 
  if (alphabetSize == "")
    { alphabet_size = 0; }
  else
    { if ((alphabetSize < 0) || (alphabetSize > 168))
        { error("funny \"alphabetSize\""); }
      # Bug fix: the nominal size was never saved when "alphabetSize"
      # was given, so the END block silently ignored it.
      alphabet_size = alphabetSize;
    }

  split("", full_count);    # count of each full tuple
  split("", part_count);    # count of each tuple with PLACEth char removed
  split("", letter_count);  # count of each letter seen at position PLACE
  n_letters = 0;            # distinct letters seen at position PLACE
  tot_count = 0;            # total tuple occurrences
  n_tuples = 0;             # distinct tuples read
}

// {
  # Process one input record "COUNT WORD": accumulate the full-tuple,
  # context, and letter histograms.
  if (abort >= 0) { exit(abort); }
  if (NF != 2) { error(("line " NR ": format error")); }
  cnt = $1;
  tup = $2;
  if (length(tup) != order) { error(("line " NR ": wrong word length")); }
  if (tup in full_count) { error(("line " NR ": tuple \"" tup "\" repeated")); }
  full_count[tup] = cnt;
  # Context = the tuple with the PLACEth character removed.
  ctx = ( substr(tup,1,place-1) substr(tup, place+1, order-place) );
  part_count[ctx] += cnt;
  # Letter occupying the PLACEth position.
  ltr = substr(tup, place, 1);
  if (! (ltr in letter_count)) { n_letters++; }
  letter_count[ltr] += cnt;
  tot_count += cnt;
  n_tuples++;
}

END {
  if (abort >= 0) { exit(abort); }
  # Choose the number of categories for the probability estimate:
  # the nominal alphabet size if one was given, else the number of
  # distinct letters actually observed at position PLACE.
  if (alphabet_size != 0)
    { if (n_letters > alphabetSize) 
        { error(("true alphabet size = " n_letters)); }
      n_cats = alphabet_size;
    }
  else 
    { n_cats = n_letters; }
  scale = -1/log(2.0);  # converts -ln(p) to bits
  tot_info = 0;
  # Print the entropy table: one "INFO WORD" line per tuple.
  for (tup in full_count) 
    { ctx = (substr(tup,1,place-1) substr(tup,place+1,order-place));
      ltr = substr(tup,place,1);
      nfull = full_count[tup];
      npart = part_count[ctx];
      # Assume that, "a priori", all histograms are equally likely
      # (Laplace-style smoothing: add 1 to the numerator, n_cats to
      # the denominator).
      prob = (nfull + 1)/(npart + n_cats);
      info = scale*log(prob);
      printf "%7.3f %s\n", info, tup;
      tot_info += nfull*info;
    }
  # print summary
  print_summary(tot_info, n_tuples, order, place, alphabet_size, n_letters);
}

function print_summary(tot_info, n_tuples, order, place, a_nom, a_obs,   err,nch,hmed,left,right)
{
  # Writes a human-readable summary of the run to standard error.
  # a_nom = nominal alphabet size (0 if none was given);
  # a_obs = number of distinct letters actually observed.
  # Also reads the global tot_count (total tuple occurrences).
  err = "/dev/stderr";
  left = place - 1;
  right = order - place;
  # Characters in the full text, counting the order-1 filler
  # characters that pad the tuple windows at the ends.
  nch = tot_count + order - 1;
  # Mean conditional entropy, in bits per character.
  hmed = tot_info/tot_count;

  printf "# full text: %d characters (including fillers)\n", nch > err;
  printf "# context: left = %d right = %d  (order = %d)\n", left, right, order > err;
  printf "# alphabet size:" > err;
  if (a_nom != 0) { printf " nominal = %d", a_nom > err; }
  printf " actual = %d\n", a_obs > err;
  printf "# mean conditional entropy h_%d: %6.3f bits/character\n", order, hmed > err;
}

function error(msg)
{
  # Reports MSG plus the usage summary on stderr, then stops the
  # script: "exit" jumps to the END block, whose "exit(abort)" makes
  # the process terminate with status 1.
  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1; exit 0;
}