#! /usr/bin/gawk -f
# Last edited on 2004-02-26 06:46:17 by stolfi

# Sampling functions for engl/wow
# English - main text from Wells's "War of the Worlds".
# Only alpha words from the text, split at hyphens, mapped to lowercase.
# Note that hyphens are encoded as "~".

# To be included in wds-to-tlw

function smp_define_patterns(smp,sec)
{
  # This sample needs no patterns; the only work done here is checking
  # the section name.  The single accepted section is "tot.1"; any other
  # value is reported through data_error (not defined in this file).
  # The smp argument is not used here.
  if (sec == "tot.1") { return; }
  data_error(("invalid section \"" sec "\""));
}

function smp_reclassify_word(smp,sec,cursec,curlin,type,wd)
{
  # Returns the type code to use for word wd found in subsection cursec:
  # "n", "x", or the incoming type unchanged.
  # The smp, sec and curlin arguments are not used here.
  # The order of the tests matters: the "n" cases are decided before "x".

  # "n" for everything outside the ordinary chapters (omit opening quote),
  # and also for titles:
  if ((cursec !~ /^{p[12]}{c[1-9][0-9]*}{tx}/) || (cursec ~ /{(tt|cn)}$/)) {
    return "n";
  }

  # "x" for subsections that are not main text:
  if (cursec !~ /{tx}$/) { return "x"; }

  # Punctuation (type "p") becomes "n" unless it is a parag break ("=");
  # every other word keeps its type:
  return (((type == "p") && (wd != "=")) ? "n" : type);
}

function smp_fix_word(smp,sec,type,wd)
{
  # Returns wd with hyphens (encoded as "~") turned into blanks and
  # upper case mapped to lower case.  The two steps are independent,
  # so their order does not affect the result.
  # There are no accented letters in this sample.
  # The smp, sec and type arguments are not used here.

  # Break at hyphens:
  gsub(/[~]/, " ", wd);

  # Map upper case to lower case:
  return tolower(wd);
}

function smp_is_good_word(smp,sec,type,wd)
{
  # Returns 1 if wd is acceptable, 0 otherwise.
  # Accepted: lowercase letters only, each optionally preceded by one
  # apostrophe, plus an optional trailing apostrophe.  At least one
  # letter is required, and apostrophes can't be doubled.
  # Note that 'tis OK to begin an' end with apostrophe!
  # The smp, sec and type arguments are not used here.
  if (wd ~ /^([\']?[a-z])+[\']?$/) { return 1; }
  return 0;

  # The following (disabled) alternative allows hyphenated words.
  # Note that each word of a hyphenated compound must contain
  # at least one letter.
  # return (wd ~ /^(([']?[a-z])+[']?)([~]([']?[a-z])+[']?)*$/);
}