#! /usr/bin/gawk -f
# Last edited on 2012-05-05 22:46:39 by stolfilocal

# Sampling functions for engl/cul
# English - main text from Culpeper's herbal, mapped to lowercase.
# Only alpha words from the text, split at hyphens, mapped to lowercase.
# Note that hyphens are encoded as "~", abbrev dots as "/".

# To be included in wds-to-tlw

# Validates the requested output section {sec}.
# This sample needs no patterns, so the only effect is a call to
# data_error() when {sec} is not one of "pre.1", "her.1", "rec.1",
# "tot.1".  The {smp} argument is not used here.
function smp_define_patterns(smp,sec) {
  # Known section: nothing to define.
  if ((sec == "pre.1") || (sec == "her.1") || (sec == "rec.1") || (sec == "tot.1")) { return; }
  data_error(("invalid section \"" sec "\""));
}

# Tells whether {wd} is an acceptable word; returns 1 if so, 0 otherwise.
# Accepted: "&" alone, "&c/" alone, or a run of lowercase letters, each
# optionally preceded by one apostrophe, optionally followed by a single
# final apostrophe or "/".  Hence apostrophes are never doubled, but a
# word may both begin and end with one.
# The {smp}, {sec} and {type} arguments are not used here.
function smp_is_good_word(smp,sec,type,wd) {
  # The ampersand forms are accepted verbatim:
  if ((wd == "&") || (wd == "&c/")) { return 1; }

  return (wd ~ /^([']?[a-z])+['\/]?$/);

  # Disabled alternative that would also allow hyphenated compounds,
  # where each component must contain at least one letter:
  # return (wd ~ /^(([']?[a-z])+[']?)([-]([']?[a-z])+['\/]?)*$/);
}

# Returns the (possibly revised) type code of word {wd}, given the
# requested output section {sec}, the section tag string {cursec} where
# the word occurs, and its incoming {type}.
# Possible results: "n" (material deleted / discarded), "x" (rejected
# subsection), "a" (for "&" and "&c/"), or {type} unchanged.
# The {smp} and {curlin} arguments, and the extra local {valid}, are not
# used here.
function smp_reclassify_word(smp,sec,cursec,curlin,type,wd,   valid) {
  if (((sec == "tot.1") && (cursec ~ /^{pro}/)) || (sec == "pre.1")) {
    if (cursec ~ /^{pro}{tordr}{tx}/) {
      # Inside "To the reader"; reject subsections that are not plain text:
      if (cursec !~ /{tx}(|{ius}{txu})$/) { return "x"; }
    } else {
      # Material outside the "To the reader" section is deleted:
      return "n";
    }
  } else if (((sec == "tot.1") && (cursec ~ /^{hb}/)) || (sec == "her.1")) {
    if (cursec ~ /^{hb}{h[1-9][0-9]*}{tx}/) {
      # Inside the text of a numbered herb; reject non-plain subsections:
      if (cursec !~ /{tx}(|{s[A-Z]}{txs})$/) { return "x"; }
    } else {
      # Material outside the herbal proper is deleted.  The herb number
      # cannot start with 0, so herb 0 in the prologue (which may be
      # atypical) is omitted too:
      return "n";
    }
  } else if (((sec == "tot.1") && (cursec ~ /^{rx}/)) || (sec == "rec.1")) {
    if (cursec ~ /^{rx}{s[12]}{bd}{c[1-9][0-9]*}{tx/) {
      # Inside a numbered paragraph of the recipes section (this includes
      # the initial "Directions" section); reject anything that is not
      # directly in a {tx...} element:
      if (cursec !~ /{tx[0-9]*}$/) { return "x"; }
    } else {
      # Material outside the recipes section proper is deleted:
      return "n";
    }
  } else {
    # NOTE(review): besides an unknown {sec}, this branch is also reached
    # when {sec} is "tot.1" and {cursec} starts with none of {pro}, {hb},
    # {rx} -- confirm that reporting an error is intended in that case.
    # If arg_error() returns, the checks below are still applied.
    arg_error(("bad output section \"" sec "\""));
  }

  # Classify "&" and "&c/" as alpha, just in case:
  if ((wd == "&") || (wd == "&c/")) { return "a"; }

  # Anything that is not punctuation, and the parag break "=", keeps its type:
  if ((type != "p") || (wd == "=")) { return type; }

  # Remaining punctuation is discarded:
  return "n";
}

# Returns {wd} normalized for output: every "~" (the encoding of a
# hyphen) is replaced by a blank, so compounds break at hyphens, and
# upper case is mapped to lower case.
# The {smp}, {sec} and {type} arguments are not used here.
function smp_fix_word(smp,sec,type,wd) {
  # Break at hyphens:
  gsub(/[~]/, " ", wd);
  # Map upper case to lower case:
  return tolower(wd);
}