#! /usr/bin/gawk -f
# Last edited on 2004-02-26 06:25:49 by stolfi

# Sampling functions for latn/nwt
# The Four Gospels in Latin (Vulgate), mapped to lowercase.
# To be included in wds-to-tlw
#
# The functions below call data_error() and arg_error(), which are not
# defined here; they must be supplied by the including program.

# Validates the section tag {sec}.  This sample needs no patterns, so the
# only effect is to call data_error() when {sec} is not one of "mat.1",
# "mrk.1", "luk.1", "joh.1" or "tot.1".  The {smp} argument is not used.
function smp_define_patterns(smp,sec)
{
  # No patterns needed
  if ((sec != "mat.1") && (sec != "mrk.1") && (sec != "luk.1") &&
      (sec != "joh.1") && (sec != "tot.1")) {
    data_error(("invalid section \"" sec "\""));
  }
}

# Decides how a word {wd} of type {type}, found at location {cursec}, is to
# be treated when sampling section {sec}.  Returns:
#   "n"    if {cursec} lies outside the book(s) selected by {sec}, or the
#          word is punctuation (type "p") other than "=";
#   "x"    if {cursec} does not end with a chapter-and-verse locator;
#   {type} unchanged, otherwise.
# Calls arg_error() when {sec} is not a recognized section.  The {smp} and
# {curlin} arguments are not used.
function smp_reclassify_word(smp,sec,cursec,curlin,type,wd)
{
  # Delete any material outside Book {sec}:
  if (sec == "mat.1") {
    if (cursec !~ /^{b1}{c[1-9][0-9]*}{v[1-9][0-9]*}/) { return "n"; }
  } else if (sec == "mrk.1") {
    if (cursec !~ /^{b2}{c[1-9][0-9]*}{v[1-9][0-9]*}/) { return "n"; }
  } else if (sec == "luk.1") {
    if (cursec !~ /^{b3}{c[1-9][0-9]*}{v[1-9][0-9]*}/) { return "n"; }
  } else if (sec == "joh.1") {
    if (cursec !~ /^{b4}{c[1-9][0-9]*}{v[1-9][0-9]*}/) { return "n"; }
  } else if (sec == "tot.1") {
    # "tot.1" selects all four books at once.
    if (cursec !~ /^{b[1-4]}{c[1-9][0-9]*}{v[1-9][0-9]*}/) { return "n"; }
  } else {
    # Report the offending value of {sec} itself.
    arg_error(("bad subsection \"" sec "\""));
  }

  # Reject any material that is not verse text:
  if (cursec !~ /{c[1-9][0-9]*}{v[1-9][0-9]*}$/) { return "x"; }

  # Delete titles:
  # NOTE(review): this test can never succeed here, because a {cursec}
  # ending in "{tt}" or "{cn}" has already been rejected with "x" by the
  # verse-text test above.  Confirm whether titles were meant to be
  # tested first (yielding "n") before reordering.
  if (cursec ~ /{(tt|cn)}$/) { return "n"; }

  # Discard punctuation other than parag breaks:
  if ((type == "p") && (wd != "=")) { return "n"; }

  return type;
}

# Returns {wd} with upper case letters mapped to lower case (just in
# case).  The {smp}, {sec} and {type} arguments are not used.
function smp_fix_word(smp,sec,type,wd)
{
  return tolower(wd);
}

# Returns 1 if {wd} is a non-empty string of lowercase letters excluding
# "j" and "w", and 0 otherwise.  The text uses neither hyphen nor
# apostrophe.  The {smp}, {sec} and {type} arguments are not used.
function smp_is_good_word(smp,sec,type,wd)
{
  return (wd ~ /^[a-ik-vx-z]+$/);
}