#! /usr/bin/gawk -f
# Last edited on 2001-01-17 03:27:20 by stolfi

BEGIN {
  abort = -1;
  usage = ( "compute-hk-entropies \\\n" \
    "  < INFILE > OUTFILE" \
  );

  # Assumes the input records have fields
  # 
  #   COUNT ENTROPY NGRAM 
  # 
  # where 
  #   
  #   NGRAM is a string of zero or more EVA elements, with capitalized
  #   ligatures and elements marked off by {}, and prefixed with a
  #   dummy marker "{...}"
  #   
  #   COUNT is the number of tokens that have that prefix;
  #   
  #   ENTROPY is the conditional entropy of the next symbol (including
  #   word stop) following that NGRAM. 
  # 
  # The program combines the data for all NGRAMs of the same 
  # length (defined as number of {}-delimited elements),
  # and writes out for them a single record with the format
  # 
  #   ORDER TOTCOUNT AVENTROPY
  # 
  # where
  #
  #   ORDER is the length of the NGRAMs that produced 
  #   this entry, starting with 1 (the "{}" marker).
  # 
  #   TOTCOUNT is the total count of those NGRAMs.
  # 
  #   AVENTROPY is the average entropy of the next symbol,
  #   given the ORDER-1 previous symbols.  It is equal to sum of ENTROPY*COUNT
  #   for all NGRAMs that contributed to this entry,
  #   divided by TOTCOUNT.
  
  # indexed by character position:
  split("", totCt);    # `totCt[k]' = tot count of ngrams with `k' elems.
  split("", totCtEnt); # `totCtEnt[k]' = sum of COUNT*ENTROPY for those ngrams.
  maxk = 0;
}

(abort >= 0) { exit abort; }

/^ *([#]|$)/ { next; }

/./ { 
  if (NF != 3) { data_error("bad line format"); }
  ct = $1; ent = $2; ngram = $3;
  if (ngram !~ /^[{}_a-zA-Z?]+$/) { data_error("bad ngram"); }
  w = ngram;
 
  # split ngram into elements, and compute the element count `k':
  gsub(/[}][{]/, "} {", w);
  k = split(w, welem, " ");

  # Element consistency check:
  if (welem[1] != "{}") { data_error("no leading {}"); }
  for (i = 2; i <= k; i++) 
    { e = welem[i];
      if (e !~ /^[{][_a-zA-Z?]+[}]$/) { data_error(("badly formed elem \"" e "\"")); }
    }
    
  # Tally entry:
  totCt[k] += ct;
  totCtEnt[k] += ent*ct;
  if (k > maxk) { maxk = k; }
  next;
}

END {
  if (abort >= 0) { exit abort; }
  totCtPrev = totCt[1];
  for (k = 1; k <= maxk; k++)
    { ct = totCt[k];
      if (ct > totCtPrev) { data_error("inconsistent ngram counts"); }
      avEnt = totCtEnt[k]/ct;
      printf "%3d %7d %6.3f\n", k, ct, avEnt;
    }
}

function arg_error(msg)
{ 
  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1; exit 1;
}

function data_error(msg)
{ 
  printf "line %d: %s\n", FNR, msg > "/dev/stderr";
  abort = 1; exit 1;
}

function table_error(msg)
{ 
  printf "error in elem table: %s\n", msg > "/dev/stderr";
  abort = 1; exit 1;
}