#! /usr/bin/gawk -f
# Last edited on 2012-05-05 20:38:39 by stolfilocal

# Sampling functions for chrc/red
# Chinese - Dream of the Red Mansion in GB code,
# converted to Voynichese-looking Roman-like code

# To be included in select-evt-lines, fix-raw-words, select-gud-bad-words

# These functions assume that the word table {wmap} maps each
# lowercase pinyin word that appears in the input to a distinct
# "pseudo-Voynich" Roman-style numeral, in lowercase, prefixed by "@".
#
# External names used but not defined here: the array {wmap} and the
# error reporters {data_error} and {arg_error}.  Whether those reporters
# return or abort cannot be seen from this file.

function smp_define_patterns(smp,sec)
  {
    # Checks that {smp} and {sec} are the only sample/section handled
    # by this file, and sets the global {gb_bytes} to the string of
    # characters accepted as either half of a GB code pair.
    if (smp != "chrc/red") { data_error(("wrong sample \"" smp "\"")); }
    if (sec != "tot.1") { data_error(("invalid section \"" sec "\"")); }
    # NOTE(review): these look like the single-byte (Latin-1) codes
    # 0xA1..0xFE; presumably gawk must run in a byte-oriented locale
    # for {length}/{substr} to see them as one character each -- confirm.
    gb_bytes = ( \
        "¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿" \
        "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞ" \
        "ßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþ" \
      );
  }

function smp_is_good_word(smp,sec,type,wd)
  {
    # Returns 1 if {wd} is a non-empty string consisting only of the
    # letters of the pseudo-Roman code, else 0.  The arguments
    # {smp,sec,type} are not used here.
    # The Voynichese-like pseudo-Roman code uses lowercase letters only
    # (The test must be applied to the parameter {wd}; an undefined
    # variable would be the empty string and never match.)
    return (wd ~ /^[aoydsrlciektph]+$/);
  }

function smp_is_GB_code(wd)
  {
    # Returns 1 if {wd} has length 2 and both of its characters occur
    # in {gb_bytes}, else 0.  Requires {smp_define_patterns} to have
    # set {gb_bytes} beforehand.
    return \
      (length(wd) == 2) &&
      (index(gb_bytes, substr(wd,1,1)) > 0) &&
      (index(gb_bytes, substr(wd,2,1)) > 0);
  }

function smp_reclassify_word(smp,sec,cursec,curlin,type,wd)
  {
    # Returns the (possibly revised) type code for word {wd} of type
    # {type} found in the section whose tag string is {cursec}:
    #   "n"  for words outside section {sec}, in titles, or that are
    #        punctuation other than "=";
    #   "x"  for words in material that is not chapter prose text;
    #   otherwise {type}, except that an "a" word whose table entry is
    #   delimited by guillemets becomes "p" and one delimited by
    #   parentheses becomes "s".
    # The arguments {smp} and {curlin} are not used here.  Since awk
    # passes scalars by value, the caller's {wd} is not modified.

    # Delete anything outside section {sec}:
    if (sec == "tot.1")
      { if (cursec !~ /^{b[12]}/) { return "n"; } }
    else
      { arg_error(("bad subsection \"" sec "\"")); }

    # Delete titles:
    if (cursec ~ /{(tt|cn)}$/) { return "n"; }

    # Reject any material that is not prose text:
    if (cursec !~ /{c[1-9][0-9]*}{tx}$/) { return "x"; }

    if (type == "a")
      { # The word should be actually a GB byte-pair.
        # Map it through the code table:
        if (! smp_is_GB_code(wd))
          { data_error(("malformed GB code \"" wd "\"")); }
        if (wd in wmap)
          { wd = wmap[wd];
            if (wd ~ /^[«].+[»]$/)
              { type = "p"; }
            else if (wd ~ /^[(].+[)]$/)
              { type = "s"; }
          }
        else
          { data_error(("word not in code table \"" wd "\"")); }
      }

    # Discard punctuation other than parag breaks:
    if ((type == "p") && (wd != "=")) { return "n"; }

    return type;
  }

function smp_fix_word(smp,sec,type,wd,  nfld,fld,res,sep)
  {
    # Returns the cleaned-up form of word {wd}.  Words whose {type} is
    # not "a" are returned unchanged.  An "a" word is replaced by its
    # {wmap} entry, with the leading "@" stripped; an entry lacking
    # the "@" is returned wrapped in "?" marks instead.
    # The arguments {smp,sec} and the locals {nfld,fld,res,sep} are
    # not used; they are kept so that the signature is unchanged.
    if (type == "a")
      { # The word should be a GB byte-pair.
        # Map it through the code table:
        if (wd in wmap)
          { wd = wmap[wd]; }
        else
          { data_error(("word not in table \"" wd "\"")); }
        # Remove the leading "@", or mark with "?"
        if (wd ~ /^[@]/)
          { sub(/^[@]/, "", wd); }
        else
          { wd = ("?" wd "?"); }
      }
    return wd;
  }