#! /usr/bin/gawk -f
# Last edited on 2004-02-26 18:29:04 by stolfi
# Sampling functions for engn/wnm
# English - proper names from Wells's "The War of the Worlds",
# Split at hyphens, mapped to lowercase.
# Note that hyphens are encoded as "~".
# To be included in wds-to-tlw

function smp_define_patterns(smp,sec)
{
  # This sample defines no patterns, so there is nothing to set up here.
  # The only work is validating {sec}: "tot.1" is the sole section accepted;
  # anything else is reported through {data_error}.  {smp} is not used.
  if (sec == "tot.1") { return; }
  data_error(("invalid section \"" sec "\""));
}

function smp_reclassify_word(smp,sec,cursec,curlin,type,wd)
{
  # Decides whether the word {wd} is kept as a candidate proper name.
  # Returns {type} unchanged when the word is kept, and "n" when it is
  # rejected.  {cursec} is the section tag tested against the chapter-text
  # pattern below; {wmap} is a global table indexed by word.
  # {smp}, {sec} and {curlin} are not used by this sample.

  # Keep only alpha words (possibly hyphenated).
  if (type != "a")  { return "n"; }

  # Take only texts of ordinary chapters (omit opening quote).
  # The braces are literal characters of the section tag; they are written
  # as bracket expressions so that they can never be parsed as (malformed)
  # interval expressions by the regexp engine.
  if (cursec !~ /^[{]p1[}][{]c[1-9][0-9]*[}][{]tx[}]$/) { return "n"; }

  # The {wmap} table should lowercase any non-name words that happen
  # to have sentence-initial caps.  Only the local copy of {wd} is
  # replaced, so the mapping affects just the test that follows.
  if (wd in wmap) { wd = wmap[wd]; }

  # Keep only words with an uppercase letter followed by a lowercase one.
  if (wd !~ /[A-Z][a-z]/)  { return "n"; }

  return type;
}

function smp_fix_word(smp,sec,type,wd)
{
  # Produces the normalized form of an accepted word and returns it.
  # Non-names (including sentence-initial capitalized ones) are assumed
  # to have been weeded out already by {smp_reclassify_word} and the
  # word-map table.  {smp}, {sec} and {type} are not used.

  # Split at hyphens, which are encoded as "~":
  gsub(/[~]/, " ", wd);
  # Fold upper case to lower case:
  return tolower(wd);
}

function smp_is_good_word(smp,sec,type,wd)
{
  # Returns 1 when {wd} is an acceptable word and 0 otherwise.
  # An acceptable word is made of lowercase letters and apostrophes only,
  # contains at least one letter, and never has two apostrophes in a row.
  # A leading and/or trailing apostrophe is allowed ('tis, an').
  # {smp}, {sec} and {type} are not used.
  if (wd !~ /^([\']?[a-z])+[\']?$/) { return 0; }
  return 1;
}