#! /usr/bin/gawk -f
# Last edited on 2000-09-21 16:39:18 by stolfi

BEGIN { 
  abort = -1;

  # Reads the run-length and line-length statistics, 
  # writes the expected run-length frequencies.
  # Assumes the input contains records of the form 
  #   COUNT STRING
  # where STRING is a run of all zeros or all ones, 
  # or a string of dots "." indicating a line length;
  # COUNT is the number of STRING occurrences seen.
  # Computes the character probabilities for "0" and "1",
  # and writes the relative run length frequencies of 0- and 1-
  # runs, for a random set of strings with those character frequencies
  # and those line lengths.

  maxlen = 1;
  maxlinlen = 1;
  split("", s);
  split("", nlines);
  totlines = 0;
  totchars = 0;
}

(abort >= 0) { exit abort; }

($2 ~ /^[.]+$/) {
  len = length($2);
  ct = $1 + 0;
  nlines[len] += ct;
  totlines += ct;
  totchars += ct * len;
  if (len > maxlinlen) { maxlinlen = len; }
  next;
}

($2 ~ /^[0]+$/) { tally($1 + 0, $2); next; }
($2 ~ /^[1]+$/) { tally($1 + 0, $2); next; }

/./ { data_error("bad run"); } 

function tally(ct, run,  c, m)
{
  c = substr(run,1,1);
  m = length(run);
  s[c] += ct * m;
  if (maxlen < m) { maxlen = m; }
}

END {
  if (abort >= 0) { exit abort; }
  
  # Print line length distribution, for checking:
  printf "line length distribution:\n";
  for (len = 0; len <= maxlinlen; len++)
    { printf "  %3d %5d\n", len, nlines[len]; }
  printf "total lines = %d\n", totlines;
  printf "\n";
    
  # Compute character probabilities:
  stot = s["0"]+s["1"]; 
  if (stot != totchars) 
    { prog_error(("inconsistency stot = " stot " totchars = " totchars)); }


  if (stot == 0) { stot = 1; }
  p0 = s["0"]/stot;
  p1 = s["1"]/stot;
  
  printf "s0 = %5d p0 = %7.5f\n", s["0"], p0;
  printf "s1 = %5d p1 = %7.5f\n", s["1"], p1;
  printf "\n";

  # Computes expected counts x0[k], x1[k] of runs of length k
  split("", x0);
  split("", x1);
  for (len1 in nlines)
    { m = nlines[len1];
      # Fucked-up gawk semantics: must add 0 to "len" to force numeric 
      # comparison in the for loop test "i <= len"!
      len = len1 + 0;
      # printf "len = %d nlines = %d\n", len, m;
      # Compute and tally the expected number of runs of length k
      # in a line of length len.
      split("", r0); r0[1] = p0;
      split("", r1); r1[1] = p1;
      for (i = 2; i <= len; i++)
        { # now r0[j], for 1 <= j < i, is the probability that 
          # a line with i-1 characters ends with a run of i-j 0's.
          # Ditto for r1[j].
          # Consider what happens if the ith character is 
          # different from the last one:
          r0[i] = 0; r1[i] = 0;
          for (j = 1; j < i; j++)
            { p01 = r0[j]*p1; 
              x0[i-j] = x0[i-j] + m*p01;
              r1[i] += p01;
              # printf "run 0^%02d: + %8.4f\n", i-j, m*p01; 
              p10 = r1[j]*p0; 
              x1[i-j] = x1[i-j] + m*p10;
              r0[i] += p10;
              # printf "run 1^%02d: + %8.4f\n", i-j, m*p10; 
            }
          # Now consider what happens if the ith character is the same:
          for (j = 1; j < i; j++) { r0[j] *= p0; r1[j] *= p1; }
        }
      # Now tally the last runs on the line:
      for (j = 1; j <= len; j++)
        { x0[len+1-j] += m*r0[j];
          x1[len+1-j] += m*r1[j];
        }
    }
  
  # Compute total expected run counts:
  totruns = 0;
  for (k in x0) { totruns += x0[k] + x1[k]; }
  if (totruns == 0) { totruns = 1; }
  
  # Now write the run frequencies:
  t0 = 0; t1 = 0;
  z0 = 0; z1 = 0;
  for(i = 1; i <= maxlen; i++)
    { f0 = x0[i]/totruns;
      f1 = x1[i]/totruns;
      printf "%2d  %8.2f %7.5f  %8.2f %7.5f\n", i, x0[i], f0, x1[i], f1; 
      t0 += f0; t1 += f1;
      z0 += i*f0; z1 += i*f1;
    }
  printf "\n"; 
  printf "   %7.5f %7.5f\n", t0, t1;
  ztot = z0 + z1;
  if (ztot == 0) { ztot = 1; }
  z0 /= ztot; z1 /= ztot;
  printf "   %7.5f %7.5f\n", z0, z1; 
}

function data_error(msg)
{
  printf "file %s, line %s: %s\n", FILENAME, FNR, msg > "/dev/stderr";
  abort = 1; exit abort;
}

function prog_error(msg)
{
  printf "%s\n", msg > "/dev/stderr";
  abort = 1; exit abort;
}