#! /usr/bin/gawk -f
# Last edited on 2000-10-05 00:29:29 by stolfi

# Prints to stdout a table of digraph occurences in stdin.
# Also computes several derived tables (probabilities, entropies, etc.).
#
# Line breaks are ignored. The file is implicitly prefixed by a 
# user-given char, but is not padded at the end.  Thus, if
# N bytes are read, N digraphs are counted.

# Externally defined parameters (with "-v VAR=VALUE"): 
#   pad              the character implicitly prefixed to the file.
#   chars            string specifying the row and column ordering.
#   showentropy      1 to print entropy tables.
#   showstrangeness  1 to print strangeness/normalness tables.

# Global variables:
#   n          total digraph count
#   xn[x]      num occurrences of letter x
#   xyn[x,y]   num occurences of digraph xy

function checksym(x,  z)
{
  # Checks whether character "x" is expected. 
  # If not, prints warning.
  # Also allocates the table if this is the first occurence.
  if (!(x in xn))
    { for (z in xn)
        { xyn[x,z] = 0; xyn[z,x] = 0; }
      xn[x] = 0
    }
  if (!(x in mark))
    {
      printf "Warning: undeclared character '%s'\n", x > "/dev/stderr";
      mark[x] = 1;
    }
}

function setmarks(chars, mark,   i,x)
{
  # Sets mark[x] = true for every letter in chars
  split("", mark)
  for(i=1;i<=length(chars);i++)
    { x = substr(chars,i,1); 
    if (x in mark) { printf "duplicate char '%s'\n", x; exit 1 }
      mark[x] = 1;
    }
}

BEGIN { 
    n = 0; 
    split("", xn); 
    setmarks(chars, mark);
    if (length(pad) != 1) 
      { printf "invalid padding char\"%s\"\n", pad > "/dev/stderr";
        ok = 0; exit 1
      }
  }

/^$/ { if(die) exit 1; next; }

/./ {
  if(die) exit 1; 
  m = length($0);
  y = pad; checksym(y);
  for(i=1;i<=m;i++)
    { x = y;
      y = substr($0,i,1); 
      checksym(y);
      xyn[x,y]++;
      xn[x]++;
      n++;
    } 
  pad = y;
}

function fixchars(chars, xt, rows,    temp, i, x)
{
  # Sets rows[x] = 1 for every character x  that has an entry in xt.
  # Returns a string consisting only of the defined characters, in the 
  # order given by "chars". Characters that do not occur in "chars" are 
  # appended at the end.
  
  temp = ""
  for (i=1;i<length(chars);i++) 
    { x = substr(chars,i,1); 
      if(x in xt)
        { temp = (temp x); rows[x] = 1 }
    }
  for(x in xt) 
    { if (! (x in rows)) 
        { rows[x] = 1; temp = (temp x); }
    }
  return temp;
}

function ptable(t, xt, yt, xyt, fmt, esz, chars,    \
    x, y, rows, nchars, i,j, temp)
{
  # Prints a simple table
  # Inputs:
  #   t      = total totalorum
  #   xt     = row totals
  #   yt     = column totals 
  #   xyt    = elements
  #   fmt    = format spec for elements
  #   esz    = formatted element size in bytes
  #   chars  = list of characters to use as row and colum indices
  
  # Reduce/complete "chars" to include all characters that occured one or more times:
  split("", rows);
  chars = fixchars(chars, xt, rows);
  nchars = length(chars)
  
  printf "   ";
  printf " %*s", esz, "TT";
  for(j=1;j<=nchars;j++)
    { y = substr(chars,j,1); printf " %*s", esz, y; }
  printf "\n";

  printf "   ";
  printf " %*.*s", esz, esz, "---------";
  for(j=1;j<=nchars;j++) printf " %*.*s", esz, esz, "---------";
  printf "\n";

  for(i=1;i<=nchars;i++)
    { 
      x = substr(chars,i,1);
      printf "  %s", x;
      printf " %*s", esz, sprintf(fmt, xt[x]);
      for(j=1;j<=nchars;j++) 
        { y = substr(chars,j,1);
          if (xyt[x,y] == 0) 
            { printf " %*s", esz, "."; }
          else
            { printf " %*s", esz, sprintf(fmt, xyt[x,y]); }
        }
      printf "\n";
    }

  printf "   ";
  printf " %*.*s", esz, esz, "---------";
  for(j=1;j<=nchars;j++) printf " %*.*s", esz, esz, "---------";
  printf "\n";

  printf "TOT";
  printf " %*s", esz, sprintf(fmt, t);
  for(j=1;j<=nchars;j++) 
    { y = substr(chars,j,1);
      printf " %*s", esz, sprintf(fmt, yt[y]); 
    }
  printf "\n";
}

function entropy(p)
{
  if (p == 0) 
    { return 0.0 }
  else
    { return (- p * log(p)/log(2.0)) }
}

function pscale(p,m)
{ return int((m*p)+ 0.5) }

function sscale(v,m)
{ return int((m*v) + (v > 0 ? 0.5 : -0.5)) }

function strangeness( \
  n, xk, yk, xyk, \
  fx,fy,fxy,fmax,fmin,fexp,tmax,tmin,tsum,texp \
)
{
  if ((xk == 0) || (yk == 0)) 
    { return 0 }
  else
    { fx = xk/n;
      fy = yk/n;
      fxy = xyk/n;
      fmax = (fx < fy ? fx : fy);
      fexp = fx*fy;
      fmin = 0;
      if (fxy <= fmin)
        { return -1 }
      else if (fxy >= fmax)
        { return +1 }
      else
        { tmax = (fmax - fxy)/(fmax - fexp);
          tmin = (fxy - fmin)/(fexp - fmin);
          tsum = (log(tmin) - log(tmax))/log(2.0);
          if ( tsum > 0 )
            { texp = exp(-2*tsum); return (1 - texp)/(1 + texp) }
          else
            { texp = exp( 2*tsum); return (texp - 1)/(texp + 1) }
        }
    }
}

function normalness(n, xk, yk, xyk,    str)
{ 
  str = strangeness(n, xk, yk, xyk);
  return 1 - str*str
}

END {
  printf "Digraph counts:\n";
  printf "\n";
  ptable(n, xn, xn, xyn, "%d", 5, chars)
  printf "\n";
  
  if (showstrangeness != 0)
    { printf "Strangeness (× 99):\n";
      printf "\n";
      for (x in xn) xr[x] = pscale((1+0)/2, 99);
      for (y in xn) yr[y] = pscale((1+0)/2, 99);
      for (x in xn)
        for (y in xn) 
          { xyr[x,y] = pscale((1+strangeness(n, xn[x], xn[y], xyn[x,y]))/2, 99); }
      ptable(50, xr, yr, xyr, "%d", 2, chars)
      printf "\n";

      printf "Normalness (× 99):\n";
      printf "\n";
      for (x in xn) xr[x] = pscale(1.000, 99);
      for (y in xn) yr[y] = pscale(1.000, 99);
      for (x in xn)
        for (y in xn) 
          { xyr[x,y] = pscale(normalness(n, xn[x], xn[y], xyn[x,y]), 99); }
      ptable(50, xr, yr, xyr, "d", 2, 2, chars)
      printf "\n";
    }

  if (1)
    { printf "Next-symbol probability (× 99):\n";
      printf "\n";
      for (x in xn) xp[x] = pscale(1.000, 99);
      for (y in xn) yp[y] = pscale(xn[y]/n, 99);
      for (x in xn)
        for (y in xn) 
          { xyp[x,y] = pscale(xyn[x,y]/xn[x], 99); }
      ptable(99, xp, yp, xyp, "%d", 2, chars)
      printf "\n";

      printf "Previous-symbol probability (× 99):\n";
      printf "\n";
      for (x in xn) xp[x] = pscale(xn[x]/n, 99);
      for (y in xn) yp[y] = pscale(1.000, 99);
      for (x in xn)
        for (y in xn) 
          { xyp[x,y] = pscale(xyn[x,y]/xn[y], 99); }
      ptable(99, xp, yp, xyp, "%d", 2, chars)
      printf "\n";
    }
    
  if (showentropy != 0)
    { 
      printf "Symbol entropy: "
      h = 0.000
      for (x in xn)
        { h += entropy(xn[x]/n); }
      printf "%.3f\n\n", h;
      
      printf "Next-symbol entropy:\n"
      printf "\n";
      for (x in xn) xh[x] = 0.000;
      for (y in xn) yh[y] = entropy(xn[y]/n);
      h = 0.000
      for (x in xn)
        { 
          for (y in xn)
            { xyh[x,y] = entropy(xyn[x,y]/xn[x]); 
              xh[x] += xyh[x,y];
            }
          h += xh[x] * (xn[x]/n);
        }
      ptable(h, xh, yh, xyh, "%.3f", 5, chars)
      printf "\n";

      printf "Previous-symbol entropy:\n"
      printf "\n";
      for (x in xn) xh[x] = entropy(xn[x]/n);
      for (y in xn) yh[y] = 0.000;
      h = 0.000
      for (y in xn)
        { 
          for (x in xn) 
            { xyh[x,y] = entropy(xyn[x,y]/xn[y]);
              yh[y] += xyh[x,y];
            }
          h += yh[y] * (xn[y]/n);
        }
      ptable(h, xh, yh, xyh, "%.3f", 5, chars);
      exit 0;
    }
}