#! /usr/bin/gawk -f
# Last edited on 2004-02-26 18:29:04 by stolfi

# Sampling functions for engn/wnm
# English - proper names from Wells's "War of the Worlds",
# Split at hyphens, mapped to lowercase.
# Note that hyphens are encoded as "~".

# To be included in wds-to-tlw

function smp_define_patterns(smp,sec)
{
  # Validates the section name; this sample needs no patterns.
  #   smp - unused here.
  #   sec - section name; the only value accepted is "tot.1".
  # Any other {sec} is reported through {data_error}, which is not
  # defined in this file (presumably supplied by the including program).
  if (sec != "tot.1") { data_error(("invalid section \"" sec "\"")); }
}

function smp_reclassify_word(smp,sec,cursec,curlin,type,wd)
{
  # Decides whether the word {wd} is kept in this sample.
  # Returns {type} unchanged (always "a" at that point) if the word is
  # kept, or "n" if it is rejected.
  #   smp, sec, curlin - unused here.
  #   cursec - locator of the section where the word occurs.
  #   type   - incoming word class; only "a" can be kept.
  #   wd     - the word itself.
  # Reads the global array {wmap}, which is not defined in this file.

  # Keep only alpha words (possibly hyphenated).
  if (type != "a") { return "n"; }

  # Take only texts of ordinary chapters (omit opening quote).
  # The braces are literal characters of the locator. They are written
  # as bracket expressions so that they can never be parsed as interval
  # operators, which gawk enables by default since version 4.0.
  if (cursec !~ /^[{]p1[}][{]c[1-9][0-9]*[}][{]tx[}]$/) { return "n"; }

  # The {wmap} table should lowercase any
  # non-name words which happen to have sentence-initial caps.
  # Only the local copy of {wd} changes (awk passes scalars by value),
  # so the mapping affects just the test below.
  if (wd in wmap) { wd = wmap[wd]; }

  # Keep only words with an uppercase letter followed by a lowercase one.
  if (wd !~ /[A-Z][a-z]/) { return "n"; }

  return type;
}

function smp_fix_word(smp,sec,type,wd)
{
  # Returns the normalized form of {wd}: lowercased, with every "~"
  # (the encoding of a hyphen) replaced by a single blank.
  #   smp, sec, type - unused here.
  #
  # Assumes that {smp_reclassify_word} and the word-map table have already
  # taken care of non-names, including those in sentence-initial caps.

  # Map upper case to lower case:
  wd = tolower(wd);

  # Break at hyphens:
  gsub(/[~]/, " ", wd);

  return wd;
}

function smp_is_good_word(smp,sec,type,wd)
{
  # Returns 1 if {wd} is an acceptable word, 0 otherwise.
  #   smp, sec, type - unused here.
  #
  # Accept only lowercase alpha plus apostrophe.
  # Apostrophes can't be doubled: each one must be followed by a letter
  # or be the last character. At least one letter is required, so the
  # empty string and a lone apostrophe are rejected.
  # Note that 'tis OK to begin an' end with apostrophe!
  return (wd ~ /^([\']?[a-z])+[\']?$/);
}