#! /usr/bin/gawk -f
# Last edited on 2004-02-26 06:46:17 by stolfi
# Sampling functions for engl/wow
# English - main text from Wells's "War of the Worlds".
# Only alpha words from the text, split at hyphens, mapped to lowercase.
# Note that hyphens are encoded as "~".
# To be included in wds-to-tlw

function smp_define_patterns(smp,sec)
{
  # No patterns needed 
  if (sec != "tot.1") { data_error(("invalid section \"" sec "\"")); }
}

function smp_reclassify_word(smp,sec,cursec,curlin,type,wd)
{
  # Delete all but ordinary chapters (omit opening quote)
  if (cursec !~ /^{p[12]}{c[1-9][0-9]*}{tx}/) { return "n"; }
  
  # Delete titles:
  if (cursec ~ /{(tt|cn)}$/) { return "n"; }
  
  # Reject subsections that are not main text:
  if (cursec !~ /{tx}$/) { return "x"; }
  
  # Discard punctuation other than parag breaks: 
  if ((type == "p") && ( wd != "=")) { return "n"; }
  
  return type;
}

function smp_fix_word(smp,sec,type,wd)
{
  # Map upper case to lower case:
  wd = tolower(wd);
  # There are no accented letters in this sample.
  # Break at hyphens:
  gsub(/[~]/, " ", wd);
  return wd;
}

function smp_is_good_word(smp,sec,type,wd)
{ 
  # Accept only lowercase alpha, plus apostrophe.
  # Apostrophes can't be doubled.
  # Note that 'tis OK to begin an' end with apostrophe! 
  return (wd ~ /^([\']?[a-z])+[\']?$/);
  #
  # The following allows hyphenated words.
  # Note that each word of 
  # an hyphenated compound must contain at least one letter.
  #   return (wd ~ /^(([']?[a-z])+[']?)([~]([']?[a-z])+[']?)*$/);
}