#! /usr/bin/gawk -f 
# Last edited on 2004-10-04 15:02:50 by stolfi

BEGIN { 

  usage = ( ARGV[0] " -v order=NUM < SAMPLE.tks > SAMPLE.pfr" );

  # Reads a text, one token per line. Words must not contain spaces or
  # "_". Outputs a table of Markov transitions, in the format 
  #  "{COUNT} {FREQ} {STATE} {CHAR}"
  # The {STATE} is a string of {order} B-language chars, prefixed by "$".
  # The {FREQ} is relative to the total count for the {STATE}.
  # The entries are written in awk's (unspecified) array traversal
  # order; pipe the output through "sort" if a specific order is needed.
  # The table includes word breaks, denoted by "_".
  
  if (order == "") { arg_error("must define \"order\""); }
  # {order} must be an integer in 0..5.  A fractional or non-numeric value
  # is rejected here, before any input is read; otherwise it would only be
  # caught at the end, by the state-length consistency check.
  if ((order != int(order)) || (order < 0) || (order > 5))
    { arg_error(("bad order = \"" order "\"")); }

  split("", ct);     # {ct[st,ch]} is the count for char {ch} after state {st}. 
  split("", totct);  # {totct[st]} is the total count of state {st}.
  split("", chus);   # {chus[ch]} is 1 iff character {ch} is used.
  
  # Initial state before each parag: a string of {order} "_"s.
  inistate = substr("___________________________",1,order);
  
  # Current state:
  st = inistate;
  
  # Count of successive blank lines.  Starts at 1 so that blank lines
  # at the beginning of the input are ignored:
  nblanklines = 1;
}

// {
  # Each input line carries one token; append the word break marker "_".
  tok = ($1 "_");
  if (length(tok) > 1)
    { # Ordinary token: it ends any run of blank lines.
      nblanklines = 0;
    }
  else
    { # Blank line, i.e. a parag break.  Only the first blank line of a
      # run is processed; the others are skipped.
      if (++nblanklines > 1) { next; }
      # A parag break contributes a single "_" when {order <= 2}, and
      # {order-1} "_"s otherwise.  After a non-blank token the state
      # (if nonempty) already ends with one "_", so for {order >= 2}
      # this brings the state back to {order} "_"s.
      if (order > 2) { tok = substr("___________________________",1,order-1); }
    }
  # Feed the characters of {tok} through the Markov state, one at a time:
  ntok = length(tok);
  pos = 0;
  while (pos < ntok)
    { pos++;
      ch = substr(tok,pos,1);
      chus[ch] = 1;
      totct[st]++;
      ct[st,ch]++;
      # Shift {ch} into the state, dropping the oldest character:
      st = substr((st ch),2);
    }
  next;
}

END{
  # Writes the transition table, one line per (state,char) pair seen:
  #   "{COUNT} {FREQ} ${STATE} {CHAR}"
  # where {FREQ} is {COUNT} divided by the total count of {STATE}.
  # Lines come out in awk's (unspecified) array traversal order.

  # An {exit} executed in BEGIN (e.g. by {arg_error}) still runs this
  # rule.  Honor the {abort} flag so that a failed run produces no
  # further diagnostics or output:
  if (abort) { exit 1; }

  # Consistency check: the state must always have exactly {order} chars.
  if (length(st) != order) 
    { prog_error(("bad state = \"" st "\"")); }
  for (st in totct)
    for (ch in chus)
      if ((st,ch) in ct)
        { printf "%7d %8.6f $%s %s\n", \
            ct[st,ch], ct[st,ch]/totct[st], st, ch;
        }
}

function arg_error(msg)
{
  # Reports a command line error: writes {msg} and the usage line to
  # the standard error stream, sets the global {abort} flag, and
  # terminates the program with exit status 1.
  abort = 1;
  printf "%s\nusage: %s\n", msg, usage > "/dev/stderr";
  exit 1;
}

function prog_error(msg)
{
  # Reports an internal error: writes {msg} to the standard error
  # stream, sets the global {abort} flag, and terminates the program
  # with exit status 1.
  abort = 1;
  printf("%s\n", msg) > "/dev/stderr";
  exit 1;
}