#! /usr/bin/gawk -f 
# Last edited on 2001-01-04 19:33:38 by stolfi

BEGIN {
  abort = -1;
  usage = ( \
    "compute-cond-entropy < INFILE > OUTFILE" \
  );
  
  # Expects the input to have records of the form 
  # 
  #   COUNT EVENT SYMB
  # 
  # representing COUNT occurrences of SYMB after the specified EVENT.
  # The records must be sorted by WORD.
  # 
  # Outputs for each EVENT one line with format
  # 
  #   TOTCOUNT ENTROPY EVENT
  # 
  # where TOTCOUNT is the sum of all COUNT for the EVENT, and ENTROPY is the
  # conditional entropy of the next SYMB given that EVENT. Assumes that
  # all events are mutually exclusive, and all SYMB for each EVENT are
  # distinct.
  
  curEvent = "";
  curEventCt = 0;     # Total count for the current event.
  nSymb = 0;          # Count of symbols for this event.
  split("", symbCt);  # Counts of each symbol after the current event.
}

(abort >= 0) { exit abort; }

/./ {
  if (NF != 3) { data_error("bad field count"); }
  event = $2;
  if (event != curEvent)
    { totalize_event_entropy();
      curEvent = event;
    }
  count = $1;
  if (count !~ /^[+]*[.0-9]*[0-9][.0-9]*$/) { data_error("bad count format"); }
  symbCt[nSymb] = count; nSymb++;
  curEventCt += count;
  next;
}

END {
  totalize_event_entropy();
}

function totalize_event_entropy(   i, entropy)
{
  if (curEvent != "")
    { for (i = 0; i < nSymb; i++) { symbCt[i] /= curEventCt; }
      entropy = 0;
      for (i = 0; i < nSymb; i++)
        { if (symbCt[i] > 0) 
            { entropy += -symbCt[i]*log(symbCt[i]); }
        }
      entropy /= log(2.0);
      printf "%7d %8.4f %s\n", curEventCt, entropy, curEvent;
    }
  split("", symbCt);
  nSymb = 0;
  curEventCt = 0;
}

function data_error(msg)
{ 
  printf "line %d: %s\n", NR, msg >> "/dev/stderr";
  abort = 1;
  exit 1;
}

function arg_error(msg)
{ 
  printf "%s\n", msg >> "/dev/stderr";
  abort = 1;
  exit 1;
}