#! /usr/bin/gawk -f # Last edited on 2004-03-01 03:51:30 by stolfi BEGIN { abort = -1; usage = ( ARGV[0] "\\\n" \ " -v order=NUM \\\n" \ " -v tokens=NUM \\\n" \ " < INPUT.tks > OUTPUT.wfr " \ ); # Reads a text with arbitrary tokens separated by spaces and/or # newlines. Generates a new stream with {tokens} tokens, one per # line, by a character-based Markov generator of the given {order}, # with transition probabilities equal to the frequencies observed in # the input text. # # For the purpose of couting, each input token is terminated # implicitly by a single blank. Multiple consecutive spaces (even # across lines) are coalesced into one space. A "#" causes itself and # the rest of the line to be deleted. There must be at least # one non-blank, non-commented-out character in the input. # # Note that when {order = 0} the monkey may generate empty tokens # (blank lines). if (order == "") { arg_error("must define \"order\""); } if (tokens == "") { arg_error("must define \"tokens\""); } # State counts: split("", cts); # {cts[w]} is the total obs of state {w}. # Observed transitions (arcs) and counts: split("", nao); # {nao[w]} = number of distinct arcs out of state {w}. split("", cho); # {cho[w,k]} = label of arc number {k} out of state {w}. split("", cto); # {cto[w,a]} = count of arc labeled {a} out of state {w}. # Current state (or part thereof): state = ""; stlen = 0; # Length of {state} # Initial uncounted characters, to be counted after EOF: unproc_chars = ""; # Input statistics: ncounted = 0; # Total transitions counted so far printf "analyzing input file...\n" > "/dev/stderr"; } (abort >= 0) { exit abort; } # General line cleanup // { # Discard any #-comment gsub(/[\#].*$/, ""); # Regularize spaces, get rid of TABs: gsub(/[ \011]+/, " "); gsub(/^[ \011]+/, ""); gsub(/[ \011]+$/, ""); } # Process non-blank lines: /./ { for (i = 1; i <= NF; i++) { # Append one space after each token: w = ($(i) " "); # Count transitions analyze(w); } next; } END { if (abort >= 0) { exit abort; } # We cannot do anything on an empty input: if ((ncounted == 0) && (unproc_chars == "")) { arg_error("empty file -- monkey starved"); } while (stlen < order) { # The input length was less than the automaton's order, so # {unproc_chars} is the whole input file, possibly # replicated two or more times. Feed it again: analyze(unproc_chars); } # Now we have scanned the whole file an integer number {k>1} times, and # all of it has been counted except the segment # in {unproc_chars}. The current {state} is now valid, # namely the last {order} bytes of the file including # the final space. Save it as the initial state: init_state = state; # Now scan the uncounted chars, to count them: analyze(unproc_chars); printf "%d characters counted\n", ncounted > "/dev/stderr"; printf "generating the output text...\n" > "/dev/stderr"; # Now start the monkey business. state = init_state; ntoks = 0; nchars = 0; tok = ""; while (ntoks < tokens) { # debug_state(state); c = pick_next_char(state); if (c == " ") { print tok; tok = ""; ntoks++; } else { tok = (tok c); } state = substr((state c), 2); nchars++; } printf "%d tokens (%d characters) generated.\n", ntoks, nchars > "/dev/stderr"; } function analyze(w, i,n,c,k) { # Counts transitions for the input word {w} variables {state} and # {stlen}, and the tables {cts}, {cto}, {nao}, {cho}. However the # first {order} characters of the input stream are not # counted, and are instead saved in the variable {unproc_chars}. n = length(w); for (i = 1; i <= n; i++) { c = substr(w, i, 1); if (stlen == order) { # Count one more occurrence of {state} # Save/count word if (state in cts) { cts[state]++; } else { cts[state] = 1; nao[state] = 0; } # Count one more transition {state,c} if ((state,c) in cto) { cto[state,c]++; } else { cto[state,c] = 1; cho[state,nao[state]] = c; nao[state]++; } # Go to new state. Beware of the case {order=0}. state = substr((state c), 2); # Count one more transition: ncounted++; } else { # Cannot count this character yet; just extend the state state = (state c); stlen++; # Save it as an uncounted character: unproc_chars = ( unproc_chars c ); } } } function pick_next_char(state, n,k,c,sel,sum) { # Generate a new character: sel = rand()*cts[state]; sum = 0; n = nao[state]; for (k = 0; k < n; k++) { c = cho[state,k]; sum += cto[state,c]; if (sum >= sel) { return c; } } # Just in case: return c; } function debug_state(state, n,k,c) { printf "[%s] (%d) ->", state, cts[state] > "/dev/stderr"; n = nao[state]; for (k = 0; k < n; k++) { c = cho[state,k]; printf " (%s:%d)", c, cto[state,c] > "/dev/stderr"; } printf "\n" > "/dev/stderr"; } function arg_error(msg) { printf "%s\n", msg > "/dev/stderr"; printf "usage: %s\n", usage > "/dev/stderr"; abort = 1; exit 1; } function data_error(msg) { printf "line %d: %s\n", FNR, msg > "/dev/stderr"; abort = 1; exit 1; }