#! /usr/bin/gawk -f
# Last edited on 2004-02-26 22:58:02 by stolfi

# Sampling functions for enrc/wow
# English - Wells's "War of the Worlds", lowercased,
# converted to Roman numerals code.
# To be included in wds-to-tlw

# These functions assume that the word table {wmap} maps each
# lowercase word that appears in the input to a distinct
# Roman numeral, in lowercase, prefixed by "@".
#
# External names used by this file but defined by the including program:
#   data_error(msg)  - error reporting routine (whether it returns is not
#                      visible from here; the code below tolerates both).
#   wmap[]           - the word table described above.

# smp_define_patterns(smp, sec)
#   Sanity-checks the sample and section identifiers. This sample needs no
#   patterns, so the function only validates its arguments: it reports an
#   error through data_error() unless {smp} is "enrc/wow" and {sec} is
#   "tot.1". Returns nothing.
function smp_define_patterns(smp,sec)
{
  # No patterns needed
  if (smp != "enrc/wow")
    { data_error(("wrong sample \"" smp "\"")); }
  if (sec != "tot.1")
    { data_error(("invalid section \"" sec "\"")); }
}

# smp_reclassify_word(smp, sec, cursec, curlin, type, wd)
#   Decides the class of word {wd}, whose incoming class is {type} and which
#   was found in the section path {cursec}. Returns "n" for material to be
#   deleted, "x" for rejected subsections, and otherwise the incoming {type}
#   unchanged. {smp}, {sec} and {curlin} are not consulted here.
function smp_reclassify_word(smp,sec,cursec,curlin,type,wd)
{
  # Delete all but ordinary chapters (omit opening quote)
  if (cursec !~ /^{p[12]}{c[1-9][0-9]*}{tx}/) { return "n"; }

  # Delete titles:
  if (cursec ~ /{(tt|cn)}$/) { return "n"; }

  # Reject subsections that are not main text:
  if (cursec !~ /{tx}$/) { return "x"; }

  # Discard punctuation other than parag breaks:
  if ((type == "p") && (wd != "=")) { return "n"; }

  return type;
}

# smp_fix_word(smp, sec, type, wd)
#   Returns the cleaned-up form of word {wd}. Words whose {type} is not "a"
#   are returned untouched. Words of type "a" are lowercased, broken into
#   pieces at "~", and each piece is replaced by its {wmap} entry with the
#   leading "@" removed; the pieces are then rejoined with single blanks.
#   A piece whose final value lacks the "@" prefix is wrapped in "?...?"
#   so that it stands out (and fails smp_is_good_word).
#   {smp} and {sec} are not consulted here.
#
#   The names after the extra spaces in the parameter list are local
#   variables (standard awk idiom); callers pass only the first four.
function smp_fix_word(smp,sec,type,wd,   nfld,fld,res,sep,i,wdi)
{
  if (type == "a")
    { # Map word to lowercase:
      wd = tolower(wd);
      # There are no accented letters in this sample.

      # Break at hyphens (represented as "~" in the input):
      gsub(/[~]/, " ", wd);

      # Apply word table to each piece (the word should be there).
      # Split explicitly on blanks so that the result does not depend on
      # whatever value the including program has given to FS.
      nfld = split(wd, fld, " ");
      res = ""; sep = "";
      for (i = 1; i <= nfld; i++)
        { wdi = fld[i];
          if (wdi in wmap)
            { wdi = wmap[wdi]; }
          else
            { # If data_error returns, {wdi} keeps the raw word and is
              # flagged with "?" below.
              data_error(("word not in table \"" wdi "\""));
            }

          # Remove the leading "@", or mark with "?"
          if (wdi ~ /^[@]/)
            { sub(/^[@]/, "", wdi); }
          else
            { wdi = ("?" wdi "?"); }

          res = ( res sep wdi );
          sep = " ";
        }
      wd = res;
    }
  return wd;
}

# smp_is_good_word(smp, sec, type, wd)
#   Returns 1 if {wd} is a non-empty string made only of the letters used
#   by the Roman numeral code, 0 otherwise. {smp}, {sec} and {type} are not
#   consulted here.
function smp_is_good_word(smp,sec,type,wd)
{
  # The Roman numeral code uses lowercase letters only,
  # with p = 5000, b = 10000.
  return (wd ~ /^[ivxlcdmpb]+$/);
}