#! /usr/bin/gawk -f
# Last edited on 2012-05-05 21:04:01 by stolfilocal

# Sampling functions for viet/ptt
# The Catholic Vietnamese Gospels, lowercased, in VIQR encoding.

# To be included in wds-to-tlw

# The Gospels source file has many hyphenated compounds, and many
# transliterated Hebrew nouns (mostly names but a few common ones as well).
# The latter can be distinguished by their capitalization,
# and are written WITHOUT hyphenation, e.g. "Ye^su Kito^" for
# "Jesus Christ".
#
# For compatibility with Voynichese and the Bible in other languages,
# we must count proper names as words, after turning them into lower
# case. For compatibility with other languages, like Chinese, we break
# all hyphenated compounds (including foreign names) into separate syllables.
#
# The spelling conventions used in the Catholic Gospels are quite
# different from those used in the Pentateuch, e.g. "Ysaac" versus
# "Y-sa'c", "Ye^rico^" versus "Gie^-ri-co^". Perhaps the difference is
# due to the pre-translation language (Vulgate versus King James?), or
# to the Portuguese roots of the Catholic Church in Vietnam. Anyway,
# in order to make the two files compatible, we assume that the word-map
# table {wmap} reencodes and hyphenates all capitalized Gospel names
# according to the Pentateuch conventions (i.e. maps "Ye^rico^" to
# "Gie^-ri-co^", etc.)

function smp_define_patterns(smp,sec)
{
  # Validates the section tag {sec} and reports a data error if it is
  # not one of the sections known to this sample.  Despite its name,
  # this function currently defines no patterns: the definitions below
  # are commented out, and {smp_is_good_word} uses a literal regex.
  #   {smp} - sample identifier (not used here).
  #   {sec} - section tag; one of "mat.1", "mrk.1", "luk.1", "jhn.1", "tot.1".

  if ((sec != "mat.1") &&
      (sec != "mrk.1") &&
      (sec != "luk.1") &&
      (sec != "jhn.1") &&
      (sec != "tot.1")) {
    data_error(("invalid section \"" sec "\""));
  }

  # This does not work, unfortunately:
  #
  # Some handy patterns (in the original VIQR encoding):
  # viqr_vowel_pat = "([a][\\(\\^]?|[o][\\+\\^]?|[u][\\+]?|[e][\\^]?|[iy])";
  # viqr_init_pat = "([d][d]?|[ckg][h]?|[n][g]?[h]?|[p][h]|[q]|[t][rh]?|[bfhjlmrsvx]|)";
  # viqr_fin_pat = "([c][h]?|[n][gh]?|[mpt]|)";
  # viqr_tone_pat = "([`\'.\\?~]|)";
  # viqr_word_pat = ( \
  #   "^" viqr_init_pat \
  #   viqr_vowel_pat "+" viqr_tone_pat viqr_vowel_pat "*" \
  #   viqr_fin_pat "$" \
  # );
}

function smp_is_good_word(smp,sec,type,wd)
{
  # Returns 1 if {wd} is a single well-formed lowercase syllable in
  # VIQR encoding, 0 otherwise.  Only {wd} is examined; {smp}, {sec}
  # and {type} are not used.
  #
  # The pattern is, in order: an optional initial consonant (cluster),
  # one or more vowels each with an optional postfix VIQR modifier,
  # an optional tone mark, zero or more further vowels, and an
  # optional final consonant (cluster).
  #
  # Note that the pattern contains no hyphen, so hyphenated compounds
  # are rejected; {smp_fix_word} replaces hyphens by blanks.

  return (wd ~ /^([d][d]?|[ckg][h]?|[n][g]?[h]?|[p][h]|[q]|[t][rh]?|[bfhjlmrsvx]|)([a][(^]?|[o][+^]?|[u][+]?|[e][\^]?|[iy])+([`'.?~]|)([a][(^]?|[o][+^]?|[u][+]?|[e][\^]?|[iy])*([c][h]?|[n][gh]?|[mpt]|)$/);
}

function smp_reclassify_word(smp,sec,cursec,curlin,type,wd)
{
  # Returns the (possibly revised) type code for the token {wd},
  # given the requested section {sec} and the locator {cursec} of the
  # current input line.  The result is "n" or "x" when the token is
  # to be dropped for one of the reasons below, otherwise the given
  # {type} unchanged.  {smp} and {curlin} are not used.

  # Decide whether the line is inside or outside section (Book) {sec}:
  if (sec == "mat.1") {
    if (cursec !~ /^{b1}/) { return "n"; }
  } else if (sec == "mrk.1") {
    if (cursec !~ /^{b2}/) { return "n"; }
  } else if (sec == "luk.1") {
    if (cursec !~ /^{b3}/) { return "n"; }
  } else if (sec == "jhn.1") {
    if (cursec !~ /^{b4}/) { return "n"; }
  } else if (sec == "tot.1") {
    if (cursec !~ /^{b[1-5]}/) { return "n"; }
  } else {
    # Report the offending {sec}; the original code interpolated an
    # undefined variable here, so the message was always empty.
    arg_error(("bad subsection \"" sec "\""));
  }

  # Delete titles:
  if (cursec ~ /{(tt|cn)}$/) { return "n"; }

  # Reject any material that is not verse text
  # (locator must be exactly {bN}{cN}{vN}):
  if (cursec !~ /^{b[1-9][0-9]*}{c[1-9][0-9]*}{v[1-9][0-9]*}$/) { return "x"; }

  # Discard punctuation other than parag breaks:
  if ((type == "p") && (wd != "=")) { return "n"; }

  return type;
}

function smp_fix_word(smp,sec,type,wd)
{
  # Returns the cleaned-up form of {wd}: spelling fixed through the
  # global table {wmap}, mapped to lower case, with VIQR diacritics
  # restored and hyphens replaced by blanks.  A hyphenated compound
  # therefore comes back as several blank-separated syllables.
  # {smp}, {sec} and {type} are not used.

  # Apply word-map table to fix spelling of foreign
  # names and hyphenate them.  The lookup is done BEFORE
  # lowercasing, so the keys of {wmap} are matched with the
  # capitalization found in the input:
  if (wd in wmap) { wd = wmap[wd]; }

  # Map everything to lower case:
  wd = tolower(wd);

  # Map "?" to "*" in case it was used for bad chars.  This must
  # precede the next step, which re-creates genuine "?" characters:
  gsub(/[?]/, "*", wd);

  # Restore VIQR diacritics from their stand-in characters
  # (presumably substituted by an earlier processing step -- confirm):
  gsub(/[ß]/, "?", wd);
  gsub(/[µ]/, "(", wd);
  gsub(/[°]/, ".", wd);

  # Break at hyphens:
  gsub(/[-]/, " ", wd);

  return wd;
}