#! /usr/bin/gawk -f
# Last edited on 2004-02-26 22:48:46 by stolfi
# Sampling functions for envg/wow
# English in Vigenère encoding - H. G. Wells's "The War of the Worlds".
# Only alpha words from the text, split at hyphens, mapped to lowercase.
# Note that hyphens are encoded as "~".
# To be included in wds-to-tlw

function smp_define_patterns(smp,sec,   i,ci,j,cj,shift)
{
  # Initializes the global Vigenère state used by the other {smp_*}
  # functions of this sample: {vgalf}, {vgmod}, {vgkey}, {vgper},
  # {vgsub}, {vgpos} and {vgbug}.
  #
  # {smp} is not used here.  {sec} must be "tot.1"; any other
  # value is reported through {data_error}.
  if (sec != "tot.1") { data_error(("invalid section \"" sec "\"")); }
  
  # Alphabet for Vigenère encoding, and the modulus:
  vgalf = "abcdefghijklmnopqrstuvwxyz'";
  vgmod = length(vgalf);
  
  # Vigenère key and its period:
  vgkey = "ferrocyanide";
  vgper = length(vgkey);
  
  # Build the letter substitution table.
  # Letter {i+1} of {vgsub[c]} is the replacement for character {c} 
  # in positions that are congruent to {i} modulo {vgper}.
  split("", vgsub);
  for (j = 0; j < vgmod; j++) 
    { cj = substr(vgalf, j+1, 1); vgsub[cj] = ""; }
  for (i = 0; i < vgper; i++)
    { # Find the {shift} to apply in positions congruent to {i}:
      # it is the 0-based position of key letter {ci} in {vgalf}.
      ci = substr(vgkey, i+1, 1);
      shift = index(vgalf, ci) - 1;
      # Just in case the key has a letter that is not in the alphabet:
      if (shift < 0) { shift = (int(vgmod/2) + i) % vgmod; }
      # Add the corresponding letters to the substitution table:
      for (j = 0; j < vgmod; j++)
        { cj = substr(vgalf, j+1, 1); 
          vgsub[cj] = (vgsub[cj] substr(vgalf, ((j+shift) % vgmod) + 1, 1)); 
        }
    }

  # Current position in key:
  vgpos = 0;
  
  # Debugging option:
  vgbug = 0;
}

function smp_reclassify_word(smp,sec,cursec,curlin,type,wd)
{
  # Returns the (possibly revised) type code for word {wd} of type {type}
  # found in section {cursec}.  The result is:
  #   "n"    if {cursec} is not the text of an ordinary chapter
  #          (parts 1-2, numbered chapters), or is a title or
  #          chapter-number subsection, or if the word is punctuation
  #          (type "p") other than the paragraph break "=";
  #   "x"    if {cursec} is a chapter subsection that is not main text;
  #   {type} unchanged, in all other cases.
  # The arguments {smp}, {sec} and {curlin} are not used here.
  if ((cursec ~ /^{p[12]}{c[1-9][0-9]*}{tx}/) && (cursec !~ /{(tt|cn)}$/))
    { # Inside an ordinary chapter, and not in a title:
      if (cursec !~ /{tx}$/) { return "x"; }
      # Main text: keep everything except punctuation other than "=".
      if ((type != "p") || (wd == "=")) { return type; }
    }
  return "n";
}

function smp_fix_word(smp,sec,type,wd,   n,pos,ch,out)
{
  # Returns the Vigenère-encoded version of word {wd}.
  #
  # Words of type "a" are first mapped to lowercase and have each "~"
  # (the encoding of a hyphen) replaced by a blank.  Then every character
  # that is a key of the global table {vgsub} is replaced by the letter
  # of {vgsub[ch]} selected by the global key position {vgpos}, which is
  # then advanced cyclically modulo {vgper}.  Any other character is
  # copied unchanged and does not advance the key.
  #
  # If the global flag {vgbug} is set, the result is the encoded word
  # followed by "·" and the plain (preprocessed) word.
  # The arguments {smp} and {sec} are not used here.
  if (type == "a")
    { # No accented letters in this sample; lowercasing is enough.
      wd = tolower(wd);
      # Hyphens become word breaks:
      gsub(/[~]/, " ", wd);
    }

  out = "";
  n = length(wd);
  pos = 0;
  while (pos < n)
    { pos++;
      ch = substr(wd, pos, 1);
      # Characters outside the cipher alphabet pass through as they are:
      if (! (ch in vgsub)) { out = (out ch); continue; }
      # Substitute according to the current key position:
      out = (out substr(vgsub[ch], vgpos+1, 1));
      # Step to the next key letter, wrapping around at the period:
      vgpos = ((vgpos + 1 >= vgper) ? 0 : vgpos + 1);
    }

  if (vgbug) { return (out "·" wd); }
  return out;
}

function smp_is_good_word(smp,sec,type,wd)
{ 
  # Returns 1 if {wd} is non-empty and consists only of lowercase
  # letters "a".."z" and apostrophes, 0 otherwise.  Consecutive
  # apostrophes are accepted, since the Vigenère encoding may produce them.
  # When the global debugging flag {vgbug} is set, the separator "·"
  # is accepted too.  The arguments {smp}, {sec} and {type} are not used.
  if (! vgbug) { return (wd ~ /^[\'a-z]+$/); }
  return (wd ~ /^[\'a-z·]+$/);
}