#! /usr/bin/gawk -f
# Last edited on 2025-05-04 16:57:00 by stolfi

BEGIN {
  # Usage text printed by `arg_error': a two-line, backslash-continued
  # command line.
  usage = "compute_token_entropy_profile.gawk \\\n  < INFILE > OUTFILE";

  # Pending exit status; a negative value means no error has been flagged.
  abort = -1;

  # INPUT
  #
  # Each input record is expected to have three fields:
  #
  #   COUNT ENTROPY PREFIX
  #
  #   PREFIX   an EVA lexeme prefix in which ligatures are capitalized,
  #            every element is enclosed in {}, and the whole prefix
  #            begins with the word-start marker {_};
  #
  #   COUNT    how many tokens begin with that PREFIX;
  #
  #   ENTROPY  the conditional entropy of the symbol that follows
  #            that PREFIX (word stop counts as a symbol).
  #
  # OUTPUT
  #
  # The data of all prefixes that have the same length (measured in
  # {}-delimited elements) is merged into one output record:
  #
  #   CHARPOS POSCOUNT FREQ AVENTROPY WTENTROPY
  #
  #   CHARPOS    a character position within the lexeme, counted from 1;
  #
  #   POSCOUNT   how many tokens have at least CHARPOS-1 elements,
  #              namely the sum of COUNT over every PREFIX that has
  #              CHARPOS elements (the leading "{_}" included);
  #
  #   FREQ       POSCOUNT as a fraction of the number of all tokens;
  #
  #   AVENTROPY  the average entropy of the CHARPOSth element of a
  #              token (word stop included), taken only over the tokens
  #              that have at least CHARPOS-1 elements; namely the sum
  #              of ENTROPY*COUNT over every PREFIX that has CHARPOS
  #              elements (the leading "{_}" included), divided by
  #              POSCOUNT;
  #
  #   WTENTROPY  AVENTROPY weighted by FREQ.
  #
  # The WTENTROPY values, summed over all CHARPOS, must add up to the
  # token entropy.

  # Accumulators, indexed by character position `k':
  maxPos = 0;          # Largest `k' seen so far.
  split("", posCt);    # `posCt[k]' = sum of COUNT over prefixes with `k' elems.
  split("", posCtEnt); # `posCtEnt[k]' = sum of COUNT*ENTROPY over the same prefixes.
}

# Stop reading input as soon as an error status has been recorded.
abort >= 0 { exit abort; }

# Ignore blank lines and lines whose first non-blank character is `#'.
$0 ~ /^ *([#]|$)/ { next; }

# Process one data record "COUNT ENTROPY PREFIX": validate its fields,
# count the {}-delimited elements of PREFIX, and add COUNT and
# COUNT*ENTROPY to the tallies for that element count.
# Any malformed record is reported through `data_error', which exits.
/./ { 
  if (NF != 3) { data_error("bad line format"); }
  ct = $1; ent = $2; pref = $3;

  # COUNT must be an unsigned integer and ENTROPY a decimal number.
  # Without these checks awk would silently coerce a malformed field
  # to a number (e.g. "abc" becomes 0) and corrupt the tallies.
  if (ct !~ /^[0-9]+$/) { data_error(("bad count \"" ct "\"")); }
  if (ent !~ /^[-+]?([0-9]+[.]?[0-9]*|[.][0-9]+)([eE][-+]?[0-9]+)?$/)
    { data_error(("bad entropy \"" ent "\"")); }

  if (pref !~ /^[{}_a-zA-Z?]+$/) { data_error("bad prefix"); }
  w = pref;
 
  # Split the prefix into elements by inserting a blank between every
  # "}{" pair; the number of pieces is the next character position:
  gsub(/[}][{]/, "} {", w);
  pos = split(w, welem, " ");

  # Element consistency check: the first element must be the word-start
  # marker, and every other one a non-empty {}-enclosed run of letters or `?'.
  if (welem[1] != "{_}") { data_error("no leading {_}"); }
  for (i = 2; i <= pos; i++) 
    { e = welem[i];
      if (e !~ /^[{][a-zA-Z?]+[}]$/) { data_error(("badly formed elem \"" e "\"")); }
    }
    
  # Tally entry:
  posCt[pos] += ct;
  posCtEnt[pos] += ent*ct;
  if (pos > maxPos) { maxPos = pos; }
  next;
}

# Write one record "CHARPOS POSCOUNT FREQ AVENTROPY WTENTROPY" for each
# character position from 1 to `maxPos'.  Writes nothing if no data
# records were read.
END {
  # An `exit' in a rule or error handler still runs END; pass its status on.
  if (abort >= 0) { exit abort; }

  # The tally for position 1 (the bare "{_}" prefix) is used as the
  # total token count.  It must be positive, since every FREQ and
  # WTENTROPY is divided by it.
  totCt = posCt[1];
  if ((maxPos >= 1) && (totCt + 0 <= 0))
    { printf("no tokens counted for the \"{_}\" prefix\n") > "/dev/stderr";
      abort = 1; exit 1;
    }

  for (pos = 1; pos <= maxPos; pos++)
    { ct = posCt[pos];
      if (ct > totCt) { data_error("inconsistent prefix counts"); }
      # A position with no tokens (a gap in the prefix lengths, or only
      # zero counts) has no defined AVENTROPY; report it instead of
      # letting the division below abort with a fatal error.
      if (ct + 0 <= 0)
        { printf("no tokens counted for prefixes with %d elements\n", pos) > "/dev/stderr";
          abort = 1; exit 1;
        }
      freq = ct/totCt;
      avEnt = posCtEnt[pos]/ct;
      wtEnt = posCtEnt[pos]/totCt;
      printf "%3d %7d %7.5f %6.3f %6.3f\n", pos, ct, freq, avEnt, wtEnt;
    }
}

# Print `msg' and then the usage text on stderr, record exit status 1
# in `abort', and exit.
function arg_error(msg)
{
  printf("%s\n", msg) > "/dev/stderr";
  printf("usage: %s\n", usage) > "/dev/stderr";
  abort = 1;
  exit abort;
}

# Print `msg' on stderr, tagged with the current record number (FNR),
# record exit status 1 in `abort', and exit.
function data_error(msg)
{
  printf("line %d: %s\n", FNR, msg) > "/dev/stderr";
  abort = 1;
  exit abort;
}

# Print `msg' on stderr as an element-table error, record exit status 1
# in `abort', and exit.
function table_error(msg)
{
  printf("error in elem table: %s\n", msg) > "/dev/stderr";
  abort = 1;
  exit abort;
}