#! /usr/bin/gawk -f

# Prints to stdout a table of pattern classifications, produced
# by compare-freqs and compute-count-ratio.
# Each line of input should have the format  
#   "NT FT NL FL rat mk patt"
# where NT,NL are counts, FT,FL are frequancies,
# rat is the ratio NL/NT, patt is a pattern of the
# form "x:y", and mk is a two-byte class marker.

# Externally defined parameters (with "-vVAR=VALUE"): 
#   colchars   string specifying column ordering for single-char right patterns
#   rowchars   string specifying row ordering for single-char left patterns

# Global variables:
#   xn[x]      1 if string x occurred on left side
#   yn[y]      1 if string y occurred on right side
#   xyn[x,y]   classification of pattern x:y
#   maxlen     max length of left or right side of pattern (min 2)

BEGIN {
  n = 0; 
  split("", xn); clearcounts(rowchars, xn);
  split("", yn); clearcounts(colchars, yn);
  maxlen = 2
}

function clearcounts(chars, zn,   i,z)
{
  # Sets zn[z] = 0 for every letter "z" in "chars"
  for(i=1;i<=length(chars);i++)
    { z = substr(chars,i,1); 
    if (z in zn) 
      { printf "duplicate char '%s'\n", x; n=-1; exit 1 }
        zn[z] = 0;
      }
}
         
function checkrow(x,  z, len)
{
  # Called for each input pair "x:y".
  # Checks whether left string "x" is known and sets "xn[x]" to 1..
  # Prints warning if "x" is a char and was not predeclared with "-v rowchars=..."
  # If this is the first occurrence of "x" in the file, initializes the row.
  len = length(x)
  if (len == 0) { printf "bad left string '%s'\n", $0 > "/dev/stderr"; exit 1 }
  if (len > maxlen) maxlen = len;
  if ((!(x in xn)) && (len == 1))
    { printf "Warning: undeclared row character '%s'\n", x;
      xn[x] = 0;
    }
  if (xn[x] == 0)
    {
      for (z in yn) { xyn[x,z] = ""; }
      xn[x] = 1
    }
}

function checkcol(y,  z, len)
{
  # Called for each input pair "x:y".
  # Checks whether right string "y" is known and sets "yn[y]" to 1..
  # Prints warning if "y" is a char and was not predeclared with "-v colchars=..."
  # If this is the first occurrence of "y" in the file, initializes the column.
  len = length(y)
  if (len == 0) { printf "bad right string '%s'\n", $0 > "/dev/stderr"; exit 1 }
  if (len > maxlen) maxlen = len;
  if ((!(y in yn)) && (length(y) == 1))
    { printf "Warning: undeclared col character '%s'\n", y;
      yn[y] = 0;
    }
  if (yn[y] == 0)
    {
      for (z in xn) { xyn[z,y] = ""; }
      yn[y] = 1
    }
}

/^[^#]/ {
  mk = $6;
  pt = $7;
  if(match(pt, /:/))
    { x = substr(pt,1,RSTART-1); checkrow(x);
      y = substr(pt,RSTART+1);   checkcol(y);
      xyn[x,y] = mk
    }
  else
    { printf "bad pattern %s\n", pt > "/dev/stderr"; exit 1 }
}

/^$/ { next; }

function fixchars(chars, zt,    temp, tmpt, i, z)
{
  # Returns a string consisting only of the characters "z" of "chars"
  # that have "zt[z] == 1", in the order given by "chars". 
  # Characters that have "zt[z] == 1" but do not occur in "chars"
  # are appended at the end.
  
  temp = ""
  split("", tmpt)
  for (i=1;i<=length(chars);i++) 
    { z = substr(chars,i,1); 
      if(! (z in zt))
        { printf "internal error #67221 %s\n" > "/dev/stderr"; exit 1 }
      else if(zt[z] == 0)
        { delete zt[z] }
      else if(zt[z] == 1)
        { temp = (temp z); tmpt[z] = 1 }
    }
  for(z in zt) { if ((length(z) == 1) && (! (z in tmpt))) { temp = (temp z); } }
  return temp;
}

function pelem(mk, esz)
{
  # Prints one entry of the mark table
  # Inputs:
  #   mk     = pattern classification
  #   esz    = formatted element size in bytes
  #   colch  = list of characters to use as colum indices

  printf " %*s", esz, (mk == "" ? "." : mk);
}

function prow(x, yt, xyt, esz, colch, \
    y, ncols, j)
{
  # Prints one row of the mark table
  # Inputs:
  #   x      = row label and index
  #   yt     = column indicators (0 = omit row)
  #   xyt    = pattern classifications
  #   esz    = formatted element size in bytes
  #   colch  = list of characters to use as colum indices

  ncols = length(colch)
  printf "%*s |", esz, x;
  for(j=1;j<=ncols;j++) { y = substr(colch,j,1); pelem(xyt[x, y], esz); }
  for(y in yt) { if (length(y) != 1) { pelem(xyt[x, y], esz); } }
  printf "\n";
}

function ptable(xt, yt, xyt, esz, rowch, colch,  \
    x, y, nrows, i,j, temp)
{
  # Prints the mark table
  # Inputs:
  #   xt     = row indicators (0 = omit row)
  #   yt     = column indicators (0 = omit row)
  #   xyt    = pattern classifications
  #   esz    = formatted element size in bytes
  #   rowch  = list of characters to use as row indices
  #   colch  = list of characters to use as colum indices
  
  # Reduce/complete "chars" to include all characters that occured one or more times:
  rowch = fixchars(rowch, xt);
  colch = fixchars(colch, yt);
  nrows = length(rowch)
  ncols = length(colch)
  
  printf "%*.*s  ", esz, esz, "         ";
  for(j=1;j<=ncols;j++) { y = substr(colch,j,1); printf " %*s", esz, y; }
  for(y in yt) { if(length(y) != 1) { printf " %*s", esz, y; } }
  printf "\n";

  printf "%*.*s  ", esz, esz, "         ";
  for(j=1;j<=ncols;j++) { printf " %*.*s", esz, esz, "---------"; }
  for(y in yt) { if(length(y) != 1) { printf " %*.*s", esz, esz, "---------"; } }
  printf "\n";

  for(i=1;i<=nrows;i++) { x = substr(rowch,i,1); prow(x, yt, xyt, esz, colch); }
  for(x in xt) { if (length(x) != 1) { prow(x, yt, xyt, esz, colch); } }
}

END {
  printf "Pattern classes:\n";
  printf "\n";
  ptable(xn, yn, xyn, maxlen, rowchars, colchars);
  printf "\n";
  exit 0;
}