#! /usr/bin/gawk -f
# Last edited on 2012-05-05 21:04:13 by stolfilocal

# Sampling functions for viet/ptt
# The "Pentateuch" in Vietnamese (Cadman), lowercased, in VIQR encoding.
# To be included in wds-to-tlw

# The Pentateuch source file has many hyphenated compounds, and many
# transliterated Hebrew nouns (mostly names but a few common ones as well).
# The latter can be distinguished by their capitalization;
# they are written as hyphenated compounds of Vietnamese-like syllables,
# like "A'ch-te^-ro't-Ca-na-im" for "Astaroth-Carnaim".
#
# For compatibility with Voynichese and the Bible in other languages,
# we must count proper names as words, after turning them into lower
# case. For compatibility with other languages, like Chinese, we break
# all hyphenated compounds (including the foreign names) into separate
# syllables.

function smp_define_patterns(smp,sec)
{
  # Validates the section name {sec}; calls {data_error} if it is not
  # one of the section names known to this sample.  Defines no patterns
  # at present (see the disabled code below).  {smp} is not used.
  if ( \
    (sec != "gen.1") && (sec != "exo.1") && (sec != "lev.1") && \
    (sec != "num.1") && (sec != "deu.1") && (sec != "tot.1") \
  ) {
    data_error(("invalid section \"" sec "\""));
  }

  # This does not work, unfortunately:
  #
  #   # Some handy patterns (in the original VIQR encoding):
  #   viqr_vowel_pat = "([a][\\(\\^]?|[o][\\+\\^]?|[u][\\+]?|[e][\\^]?|[iy])";
  #   viqr_init_pat = "([d][d]?|[ckg][h]?|[n][g]?[h]?|[p][h]|[q]|[t][rh]?|[bfhjlmrsvx]|)";
  #   viqr_fin_pat = "([c][h]?|[n][gh]?|[mpt]|)";
  #   viqr_tone_pat = "([`'.\\?~]|)";
  #   viqr_word_pat = ( \
  #     "^" viqr_init_pat \
  #     viqr_vowel_pat "+" viqr_tone_pat viqr_vowel_pat "*" \
  #     viqr_fin_pat "$" \
  #   );
}

function smp_is_good_word(smp,sec,type,wd)
{
  # Returns true iff {wd} is a single lowercase syllable in VIQR
  # encoding: an optional initial consonant, one or more vowels (each
  # with an optional postfix vowel modifier), an optional postfix tone
  # mark, zero or more further vowels, and an optional final consonant.
  #
  # Hyphens are NOT accepted by this pattern; presumably the word has
  # already been split at hyphens by {smp_fix_word} -- confirm in caller.
  # {smp}, {sec} and {type} are not used.
  return (wd ~ /^([d][d]?|[ckg][h]?|[n][g]?[h]?|[p][h]|[q]|[t][rh]?|[bfhjlmrsvx]|)([a][(^]?|[o][+^]?|[u][+]?|[e][\^]?|[iy])+([`'.?~]|)([a][(^]?|[o][+^]?|[u][+]?|[e][\^]?|[iy])*([c][h]?|[n][gh]?|[mpt]|)$/);
}

function smp_reclassify_word(smp,sec,cursec,curlin,type,wd)
{
  # Returns the (possibly changed) type code for word {wd} of type
  # {type}, found at location {cursec} of the text, when sampling
  # section {sec}.  Returns "n" for material outside the requested book,
  # for titles, and for punctuation other than "="; returns "x" for
  # anything else that is not inside a book/chapter/verse locator;
  # otherwise returns {type} unchanged.  {smp} and {curlin} are not used.

  # Decide whether line is inside or outside section (Book) {sec}:
  if (sec == "gen.1") {
    if (cursec !~ /^{b1}/) { return "n"; }
  } else if (sec == "exo.1") {
    if (cursec !~ /^{b2}/) { return "n"; }
  } else if (sec == "lev.1") {
    if (cursec !~ /^{b3}/) { return "n"; }
  } else if (sec == "num.1") {
    if (cursec !~ /^{b4}/) { return "n"; }
  } else if (sec == "deu.1") {
    if (cursec !~ /^{b5}/) { return "n"; }
  } else if (sec == "tot.1") {
    if (cursec !~ /^{b[1-5]}/) { return "n"; }
  } else {
    # Report the offending section name.  (This used to interpolate an
    # undefined variable, so the message always showed an empty name.)
    arg_error(("bad subsection \"" sec "\""));
  }

  # Delete titles:
  if (cursec ~ /{(tt|cn)}$/) { return "n"; }

  # Reject any material that is not verse text:
  if (cursec !~ /^{b[1-9][0-9]*}{c[1-9][0-9]*}{v[1-9][0-9]*}$/) { return "x"; }

  # Discard punctuation other than parag breaks:
  if ((type == "p") && (wd != "=")) { return "n"; }

  return type;
}

function smp_fix_word(smp,sec,type,wd)
{
  # Returns {wd} mapped to lower case, with the VIQR diacritics
  # restored and every hyphen replaced by a space, so that a hyphenated
  # compound comes back as several blank-separated syllables.
  # {smp}, {sec} and {type} are not used.

  # Map everything to lowercase:
  wd = tolower(wd);

  # Map "?" to "*" in case it was used for bad chars.  This must come
  # before the restoration step below, which re-introduces "?" as a
  # legitimate VIQR tone mark:
  gsub(/[?]/, "*", wd);

  # Restore VIQR diacritics:
  gsub(/[ß]/, "?", wd);
  gsub(/[µ]/, "(", wd);
  gsub(/[°]/, ".", wd);

  # Break at hyphens:
  gsub(/[-]/, " ", wd);

  return wd;
}