#! /usr/bin/gawk -f
# Last edited on 2012-05-05 18:50:59 by stolfilocal

# Sampling functions for envt/wow
# English - Wells's "War of the Worlds", lowercased,
# with each word replaced by one or two Vietnamese words.

# To be included in wds-to-tlw

# These functions assume that the word table {wmap} maps each
# lowercase English word that appears in the input to a distinct
# Vietnamese word, in lowercase, or to a hyphenated compound.

function smp_define_patterns(smp,sec)
  {
    # Validates the sample name {smp} and section name {sec}.
    # This sample needs no patterns, so nothing else is defined here.
    # Any {smp} other than "envt/wow" or any {sec} other than "tot.1"
    # is reported through {data_error} (defined elsewhere).

    if (smp != "envt/wow")
      { data_error(("wrong sample \"" smp "\"")); }

    if (sec != "tot.1")
      { data_error(("invalid section \"" sec "\"")); }

    # This does not work, unfortunately:
    #
    #   # Some handy patterns:
    #   viqr_vowel_pat = "(([a][\\(\\^]?)|([o][\\+\\^]?)|([u][\\+]?)|([e][\\^]?)|[iy])";
    #   viqr_letter_pat = ( "((" viqr_vowel_pat "[`'.\\?~]?)|[b-df-hj-np-tvwxz])" );
    #   viqr_word_pat = ( "^" viqr_letter_pat "+$" );
  }

function smp_is_good_word(smp,sec,type,wd)
  {
    # Returns 1 if {wd} is a single syllable written with lowercase
    # letters and postfix VIQR diacritics, 0 otherwise.  The arguments
    # {smp}, {sec} and {type} are not used.
    #
    # The accepted shape is, in order:
    #   an optional initial consonant or consonant cluster;
    #   one or more vowels, each optionally followed by its VIQR
    #     modifier ("(" or "^" after "a", "+" or "^" after "o",
    #     "+" after "u", "^" after "e");
    #   an optional tone mark (one of ` ' . ? ~);
    #   an optional final consonant or cluster.
    #
    # Hyphens are NOT accepted by this pattern; hyphenated compounds
    # are broken into separate words by {smp_fix_word}.
    #
    # The circumflex after "e" must be written as an escaped literal
    # ("\^"): the form "[^]" would be taken as the start of a negated
    # bracket expression whose first member is "]", which silently
    # swallows the following "?|[iy" and corrupts the vowel alternatives.

    return (wd ~ /^([d][d]?|[ckg][h]?|[n][g]?[h]?|[p][h]|[q]|[t][rh]?|[bfhjlmrsvx]|)([a][(^]?|[o][+^]?|[u][+]?|[e]\^?|[iy])+([`'.?~]|)([c][h]?|[n][gh]?|[mpt]|)$/);
  }

function smp_reclassify_word(smp,sec,cursec,curlin,type,wd)
  {
    # Returns the type code to use for word {wd}, whose original type
    # is {type}, given the current section tag string {cursec}.
    # Returns "n" for items to be deleted, "x" for items to be
    # rejected, and {type} itself otherwise.  The arguments {smp},
    # {sec} and {curlin} are not used.
    #
    # Literal braces are written as "[{]" and "[}]" so that they can
    # never be mistaken for regexp interval expressions.

    # Delete all but ordinary chapters (omit opening quote):
    if (cursec !~ /^[{]p[12][}][{]c[1-9][0-9]*[}][{]tx[}]/) { return "n"; }

    # Delete titles:
    if (cursec ~ /[{](tt|cn)[}]$/) { return "n"; }

    # Reject subsections that are not main text:
    if (cursec !~ /[{]tx[}]$/) { return "x"; }

    # Discard punctuation other than parag breaks:
    if ((type == "p") && (wd != "=")) { return "n"; }

    return type;
  }

function smp_fix_word(smp,sec,type,wd,   nfld,fld,res,sep,i,wdi)
  {
    # Returns the Vietnamese replacement for word {wd}.  Only words of
    # {type} "a" are changed; any other {wd} is returned as given.
    # The arguments {smp} and {sec} are not used.  The names after the
    # wide gap in the parameter list are local variables; {wdi} is
    # among them so that it does not leak into the global namespace.
    #
    # A word of type "a" is lowercased, split into pieces at "~",
    # each piece is looked up in the global table {wmap}, and the
    # translations are joined with single blanks.  Hyphens inside a
    # translation are also turned into blanks.  A piece missing from
    # {wmap} is reported through {data_error} (defined elsewhere);
    # if that call returns, the piece is kept untranslated.

    if (type == "a")
      {
        # Map word to lowercase:
        wd = tolower(wd);
        # There are no accented letters in this sample.

        # Break English word at hyphens
        # (presumably encoded as "~" by an earlier stage -- confirm):
        gsub(/[~]/, " ", wd);

        # Apply word table to each piece (the word should be there).
        # The explicit " " separator makes the split independent of
        # whatever {FS} the including program has set:
        nfld = split(wd, fld, " ");
        res = ""; sep = "";
        for (i = 1; i <= nfld; i++)
          {
            wdi = fld[i];
            if (wdi in wmap)
              { wdi = wmap[wdi]; }
            else
              { data_error(("word not in table \"" wdi "\"")); }

            # Break at Vietnamese hyphens:
            gsub(/[-]/, " ", wdi);

            res = ( res sep wdi );
            sep = " ";
          }
        wd = res;
      }
    return wd;
  }