#! /usr/bin/gawk -f
# Last edited on 2004-05-26 06:10:27 by stolfi

# Sampling functions for engl/cul
# English - main text from Culpeper's herbal, mapped to lowercase.
# Only alpha words from the text, split at hyphens, mapped to lowercase.
# Note that hyphens are encoded as "~", abbrev dots as "/".

# To be included in wds-to-tlw

# NOTE(review): the "smp" argument is accepted by every function below
# but is never read in this file.

function smp_define_patterns(smp,sec) {
  # Validates the section name for this sample.
  # No patterns needed; the only accepted section is "tot.1".
  # Any other value of "sec" is reported through data_error(), which is
  # not defined in this file (presumably supplied by the including
  # script - confirm).
  if (sec != "tot.1") { data_error(("invalid section \"" sec "\"")); }
}

function smp_reclassify_word(smp,sec,cursec,curlin,type,wd,   valid) {
  # Decides the final type code of word "wd", found in section "cursec"
  # with incoming type "type".  Returns "n" (delete), "x" (reject),
  # "a" (alpha), or "type" unchanged.  The checks are order-dependent.
  # "sec" and "curlin" are not used here; the local "valid" is declared
  # but never used.

  # Delete everything outside sections whose path starts with
  # "{op}{pNNN}{tx}".  (The original comment said "text of plays",
  # which looks like a leftover from another sampler - this file is
  # for Culpeper's herbal.  TODO confirm.)
  if (cursec !~ /^{op}{p[0-9]*}{tx}/) { return "n"; }
  # Delete titles:
  if (cursec ~ /{tt}$/) { return "n"; }
  # Reject subsections that are not main text:
  if (cursec !~ /{tx}$/) { return "x"; }
  # Discard punctuation other than parag breaks ("="):
  if ((type == "p") && ( wd != "=")) { return "n"; }
  # Reclassify "&" and "&c/" as alpha, just in case:
  if (wd == "&") { return "a"; }
  if (wd == "&c/") { return "a"; }
  return type;
}

function smp_fix_word(smp,sec,type,wd) {
  # Normalizes "wd" and returns the result.  Because each "~" (encoded
  # hyphen) becomes a blank, the returned string may contain several
  # blank-separated pieces.  "sec" and "type" are not used here.

  # Map upper case to lower case:
  wd = tolower(wd);
  # Break at hyphens:
  gsub(/[~]/, " ", wd);
  return wd;
}

function smp_is_good_word(smp,sec,type,wd) {
  # Returns 1 if "wd" is acceptable, 0 otherwise.
  # Accept "&" and "&c/" alone or lowercase alpha plus apostrophe and "/".
  # Each letter may be preceded by one apostrophe, and the word may end
  # with a single apostrophe or "/".  Apostrophes can't be doubled.
  # Note that 'tis OK to begin an' end with apostrophe!
  # "sec" and "type" are not used here.
  if (wd == "&") { return 1; }
  if (wd == "&c/") { return 1; }
  return (wd ~ /^([\']?[a-z])+[\'\/]?$/);
  #
  # The following allows hyphenated words.
  # Note that each word of an hyphenated compound must contain
  # at least one letter.
  # return (wd ~ /^(([']?[a-z])+[']?)([-]([']?[a-z])+['\/]?)*$/);
}