#! /usr/bin/gawk -f
# Last edited on 2004-03-01 03:51:30 by stolfi

BEGIN {
  abort = -1;
  usage = ( ARGV[0] "\\\n" \
    "  -v order=NUM \\\n" \
    "  -v tokens=NUM \\\n" \
    "  < INPUT.tks > OUTPUT.wfr " \
  );
  
  # Reads a text with arbitrary tokens separated by spaces and/or
  # newlines. Generates a new stream with {tokens} tokens, one per
  # line, by a character-based Markov generator of the given {order},
  # with transition probabilities equal to the frequencies observed in
  # the input text.
  # 
  # For the purpose of couting, each input token is terminated
  # implicitly by a single blank. Multiple consecutive spaces (even
  # across lines) are coalesced into one space. A "#" causes itself and
  # the rest of the line to be deleted. There must be at least
  # one non-blank, non-commented-out character in the input. 
  # 
  # Note that when {order = 0} the monkey may generate empty tokens
  # (blank lines). 
  
  if (order == "") { arg_error("must define \"order\""); }
  if (tokens == "") { arg_error("must define \"tokens\""); }
  
  # State counts:
  split("", cts); # {cts[w]} is the total obs of state {w}.
  
  # Observed transitions (arcs) and counts:
  split("", nao); # {nao[w]} = number of distinct arcs out of state {w}.
  split("", cho); # {cho[w,k]} = label of arc number {k} out of state {w}.
  split("", cto); # {cto[w,a]} = count of arc labeled {a} out of state {w}.
  
  # Current state (or part thereof):
  state = "";
  stlen = 0;  # Length of {state}
  
  # Initial uncounted characters, to be counted after EOF:
  unproc_chars = "";
  
  # Input statistics:
  ncounted = 0; # Total transitions counted so far
  
  printf "analyzing input file...\n" > "/dev/stderr";
}

(abort >= 0) { exit abort; }

# General line cleanup
// {
  # Discard any #-comment
  gsub(/[\#].*$/, "");
  
  # Regularize spaces, get rid of TABs:
  gsub(/[ \011]+/, " ");
  gsub(/^[ \011]+/, "");
  gsub(/[ \011]+$/, "");
}

# Process non-blank lines:
/./ {
  for (i = 1; i <= NF; i++)
    { # Append one space after each token:
      w = ($(i) " ");
      # Count transitions
      analyze(w);
    }
  next;
}

END {
  if (abort >= 0) { exit abort; }

  # We cannot do anything on an empty input:
  if ((ncounted == 0) && (unproc_chars == "")) 
    { arg_error("empty file -- monkey starved"); }
  
  while (stlen < order) 
    { # The input length was less than the automaton's order, so
      # {unproc_chars} is the whole input file, possibly 
      # replicated two or more times. Feed it again:
      analyze(unproc_chars);
    }
  # Now we have scanned the whole file an integer number {k>1} times, and 
  # all of it has been counted except the segment
  # in {unproc_chars}. The current {state} is now valid, 
  # namely the last {order} bytes of the file including 
  # the final space. Save it as the initial state:
  init_state = state;
  
  # Now scan the uncounted chars, to count them:
  analyze(unproc_chars);
  
  printf "%d characters counted\n", ncounted > "/dev/stderr";
  printf "generating the output text...\n" > "/dev/stderr";

  # Now start the monkey business.
  state = init_state;
  ntoks = 0;
  nchars = 0;
  tok = "";
  while (ntoks < tokens)
    { 
      # debug_state(state);
      c = pick_next_char(state);
      if (c == " ")
        { print tok; tok = "";
          ntoks++; 
        }
      else
        { tok = (tok c); }
      state = substr((state c), 2);
      nchars++;
    }
  printf "%d tokens (%d characters) generated.\n", ntoks, nchars > "/dev/stderr";
}

function analyze(w,  i,n,c,k)
{ 
  # Counts transitions for the input word {w} variables {state} and
  # {stlen}, and the tables {cts}, {cto}, {nao}, {cho}. However the 
  # first {order} characters of the input stream are not 
  # counted, and are instead saved in the variable {unproc_chars}.
  
  n = length(w);
  for (i = 1; i <= n; i++)
    { c = substr(w, i, 1);
      if (stlen == order)
        { # Count one more occurrence of {state}
          # Save/count word
          if (state in cts)
            { cts[state]++; }
          else
            { cts[state] = 1; 
              nao[state] = 0;
            }
          # Count one more transition {state,c}
          if ((state,c) in cto)
            { cto[state,c]++; }
          else
            { cto[state,c] = 1; 
              cho[state,nao[state]] = c;
              nao[state]++;
            }
          # Go to new state. Beware of the case {order=0}.
          state = substr((state c), 2); 
          # Count one more transition:
          ncounted++;
        }
      else
        { # Cannot count this character yet; just extend the state
          state = (state c); stlen++;
          # Save it as an uncounted character:
          unproc_chars = ( unproc_chars c );
        }
    }
}

function pick_next_char(state, n,k,c,sel,sum)
{
  # Generate a new character:
  sel = rand()*cts[state];
  sum = 0;
  n = nao[state];
  for (k = 0; k < n; k++)
    { c = cho[state,k];
      sum += cto[state,c];
      if (sum >= sel) { return c; }
    }
  # Just in case:
  return c;
}

function debug_state(state, n,k,c)
{
  printf "[%s] (%d) ->", state, cts[state] > "/dev/stderr";
  n = nao[state];
  for (k = 0; k < n; k++)
    { c = cho[state,k];
      printf " (%s:%d)", c, cto[state,c] > "/dev/stderr";
    }
  printf "\n" > "/dev/stderr";
}

function arg_error(msg)
{
  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit 1;
}

function data_error(msg)
{
  printf "line %d: %s\n", FNR, msg > "/dev/stderr";
  abort = 1; exit 1;
}