#! /usr/bin/gawk -f
# Last edited on 2004-02-26 22:48:46 by stolfi

# Sampling functions for envg/wow
# English in Vigenère encoding - Wells's "War of the Worlds".
# Only alpha words from the text, split at hyphens, mapped to lowercase.
# Note that hyphens are encoded as "~".

# To be included in wds-to-tlw

# Global state shared by the functions in this file (all set by
# {smp_define_patterns}):
#   {vgalf}  the cipher alphabet (a-z plus apostrophe);
#   {vgmod}  its length (the modulus for letter arithmetic);
#   {vgkey}  the Vigenère key;
#   {vgper}  its length (the period of the cipher);
#   {vgsub}  per-letter substitution strings (see below);
#   {vgpos}  current 0-based position in the key;
#   {vgbug}  debugging flag.

function smp_define_patterns(smp,sec,   i,ci,j,cj,shift)
{
  # Initializes the Vigenère tables for section {sec}.
  # Only section "tot.1" is valid; anything else is reported
  # through {data_error} (not defined in this file; presumably
  # supplied by the including script).
  # The parameters after the wide gap are local variables.

  if (sec != "tot.1") { data_error(("invalid section \"" sec "\"")); }

  # Alphabet for Vigenère encoding, and the modulus:
  vgalf = "abcdefghijklmnopqrstuvwxyz'";
  vgmod = length(vgalf);

  # Vigenère key and its period:
  vgkey = "ferrocyanide";
  vgper = length(vgkey);

  # Build the letter substitution table.
  # Letter {i+1} of {vgsub[c]} is the replacement for character {c}
  # in positions that are congruent to {i} modulo {vgper}.
  split("", vgsub);
  for (j = 0; j < vgmod; j++)
    { cj = substr(vgalf, j+1, 1); vgsub[cj] = ""; }

  for (i = 0; i < vgper; i++)
    {
      # Find the {shift} to apply in positions congruent to {i}.
      # It is the 0-based index of key letter {ci} in the alphabet;
      # if {ci} is not in the alphabet (just in case...), fall back
      # to a position-dependent default.
      ci = substr(vgkey, i+1, 1);
      j = index(vgalf, ci);
      shift = (j > 0 ? j - 1 : (int(vgmod/2) + i) % vgmod);

      # Add the corresponding letters to the substitution table:
      for (j = 0; j < vgmod; j++)
        {
          cj = substr(vgalf, j+1, 1);
          vgsub[cj] = (vgsub[cj] substr(vgalf, ((j+shift) % vgmod) + 1, 1));
        }
    }

  # Current position in key:
  vgpos = 0;

  # Debugging option:
  vgbug = 0;
}

function smp_reclassify_word(smp,sec,cursec,curlin,type,wd)
{
  # Returns the (possibly changed) type code for word {wd} of type
  # {type} found in section {cursec}: "n" or "x" for words that are
  # to be dropped for the reasons given below, otherwise {type}
  # unchanged.  The parameters {smp}, {sec} and {curlin} are not used.
  #
  # NOTE(review): the patterns below rely on gawk treating a "{"
  # that does not begin a valid interval expression as a literal
  # brace - confirm with the gawk version in use.

  # Delete all but ordinary chapters (omit opening quote)
  if (cursec !~ /^{p[12]}{c[1-9][0-9]*}{tx}/) { return "n"; }

  # Delete titles:
  if (cursec ~ /{(tt|cn)}$/) { return "n"; }

  # Reject subsections that are not main text:
  if (cursec !~ /{tx}$/) { return "x"; }

  # Discard punctuation other than parag breaks:
  if ((type == "p") && ( wd != "=")) { return "n"; }

  return type;
}

function smp_fix_word(smp,sec,type,wd,   len,k,ck,cj,res)
{
  # Returns the Vigenère-encoded version of {wd}.
  # Words of type "a" are first mapped to lowercase and have each
  # "~" (encoded hyphen) replaced by a blank.  Characters that are
  # not in the cipher alphabet are copied unchanged and do not
  # advance the key.  The key position {vgpos} is global, so it
  # carries over from one call to the next.
  # The parameters after the wide gap are local variables.

  if (type == "a")
    {
      # Map word to lowercase:
      wd = tolower(wd);
      # There are no accented letters in this sample.
      # Break at hyphens:
      gsub(/[~]/, " ", wd);
    }

  # Vigenère-encode each piece, and concatenate:
  len = length(wd);
  res = "";
  for (k = 1; k <= len; k++)
    {
      ck = substr(wd, k, 1);
      if (ck in vgsub)
        {
          # Apply substitution:
          cj = substr(vgsub[ck], vgpos+1, 1);
          res = (res cj);
          # Advance along key:
          vgpos++;
          if (vgpos >= vgper) { vgpos = 0; }
        }
      else
        {
          # Preserve character:
          res = (res ck);
        }
    }

  # In debugging mode, append the (normalized) plaintext after a "·":
  wd = (vgbug ? (res "·" wd) : res);
  return wd;
}

function smp_is_good_word(smp,sec,type,wd)
{
  # Returns true iff {wd} is an acceptable encoded word.
  # Accept only lowercase alpha, plus apostrophe.
  # Note that the Vigenère encoding may create two consecutive apostrophes.
  # In debugging mode the "·" separator is accepted too.

  if (vgbug)
    { return (wd ~ /^[\'a-z·]+$/); }
  else
    { return (wd ~ /^[\'a-z]+$/); }
}