#! /usr/bin/gawk -f
# Last edited on 2012-05-05 21:03:20 by stolfilocal

# Sampling functions for viep/mky
# Pseudo-Vietnamese generated by an order-3 Markov monkey
# To be included in wds-to-tlw
# The input is already in lowercase, in modified VIQR encoding.

function smp_define_patterns(smp,sec)
{
  # Validates the section name {sec}; the only section accepted is "tot.1".
  # Any other value is reported through {data_error} (defined elsewhere).
  # No patterns are actually defined here (see the note below); {smp} is
  # not used by this function.

  if (sec != "tot.1") { data_error(("invalid section \"" sec "\"")); }

  # This does not work, unfortunately:
  #
  #   # Some handy patterns (in the original VIQR encoding):
  #   viqr_vowel_pat = "([a][\\(\\^]?|[o][\\+\\^]?|[u][\\+]?|[e][\\^]?|[iy])";
  #   viqr_init_pat = "([d][d]?|[ckg][h]?|[n][g]?[h]?|[p][h]|[q]|[t][rh]?|[bfhjlmrsvx]|)";
  #   viqr_fin_pat = "([c][h]?|[n][gh]?|[mpt]|)";
  #   viqr_tone_pat = "([`\'.\\?~]|)";
  #   viqr_word_pat = ( \
  #     "^" viqr_init_pat \
  #     viqr_vowel_pat "+" viqr_tone_pat viqr_vowel_pat "*" \
  #     viqr_fin_pat "$" \
  #   );
  #
  # Bare pattern:
  #   ([d][d]?|[ckg][h]?|[n][g]?[h]?|[p][h]|[q]|[t][rh]?|[bfhjlmrsvx]|)
  #   ([a][(^]?|[o][+^]?|[u][+]?|[e][^]?|[iy])+
  #   ([`'.?~]|)
  #   ([a][(^]?|[o][+^]?|[u][+]?|[e][^]?|[iy])*
  #   ([c][h]?|[n][gh]?|[mpt]|)
  #
  # NOTE: the bare pattern above writes the "e" alternative as [e][^]?,
  # whereas the regex actually used in {smp_is_good_word} escapes the
  # caret as [e][\^]?.
}

function smp_is_good_word(smp,sec,type,wd)
{
  # Returns 1 if {wd} has the shape of a single syllable in VIQR encoding,
  # 0 otherwise. The whole string must match, in order:
  #
  #   1. an optional initial consonant group
  #      (d, dd, c, k, g, ch, kh, gh, n, ng, nh, ngh, ph, q, t, tr, th,
  #       or one of b f h j l m r s v x);
  #   2. one or more vowels, each optionally followed by its VIQR modifier
  #      (a( a^ o+ o^ u+ e^, or plain a o u e i y);
  #   3. an optional tone mark, one of  ` ' . ? ~ ;
  #   4. zero or more further vowels (same forms as in 2);
  #   5. an optional final consonant group (c, ch, n, ng, nh, m, p, t).
  #
  # Note that a hyphen is NOT accepted by this pattern.
  # The parameters {smp}, {sec} and {type} are not used here.

  return (wd ~ /^([d][d]?|[ckg][h]?|[n][g]?[h]?|[p][h]|[q]|[t][rh]?|[bfhjlmrsvx]|)([a][(^]?|[o][+^]?|[u][+]?|[e][\^]?|[iy])+([`'.?~]|)([a][(^]?|[o][+^]?|[u][+]?|[e][\^]?|[iy])*([c][h]?|[n][gh]?|[mpt]|)$/);
}

function smp_reclassify_word(smp,sec,cursec,curlin,type,wd)
{
  # Decides the final type code of the word {wd}, given the section {sec}
  # being sampled and the locator {cursec} of the current line.
  # Returns:
  #   "n"    if {cursec} is outside section {sec}, or if {wd} is
  #          punctuation (type "p") other than the parag break "=";
  #   "x"    if {cursec} is inside the section but is not "verse" text;
  #   {type} unchanged otherwise.
  # The parameters {smp} and {curlin} are not used here.

  # Decide whether line is inside or outside section (Book) {sec}:
  if (sec == "tot.1") {
    if (cursec !~ /^{b1}{c[0-9]+}/) { return "n"; }
  } else {
    # Report the offending section name. (The original interpolated an
    # undefined variable here, so the message always showed "".)
    # {arg_error} is defined elsewhere; presumably it aborts the run.
    arg_error(("bad subsection \"" sec "\""));
  }

  # Reject any material that is not "verse" text:
  if (cursec !~ /{v[0-9]+}$/) { return "x"; }

  # Discard punctuation other than parag breaks:
  if ((type == "p") && (wd != "=")) { return "n"; }

  return type;
}

function smp_fix_word(smp,sec,type,wd)
{
  # Returns a cleaned-up copy of {wd}: maps upper case to lower case,
  # restores the VIQR diacritics that were replaced by stand-in
  # characters, and breaks the word at hyphens (each hyphen becomes a
  # blank). The parameters {smp}, {sec} and {type} are not used here.

  # Map everything to lowercase:
  wd = tolower(wd);

  # Map "?" to "*" in case it was used for bad chars.
  # This must come before the "?" diacritic is restored below, so that
  # only restored diacritics end up as "?".
  gsub(/[?]/, "*", wd);

  # Restore VIQR diacritics from their stand-in characters:
  gsub(/[ß]/, "?", wd);
  gsub(/[µ]/, "(", wd);
  gsub(/[°]/, ".", wd);

  # Break at hyphens:
  gsub(/[-]/, " ", wd);

  return wd;
}