#! /usr/bin/gawk -f
# Last edited on 2012-05-05 18:57:34 by stolfilocal

# Sampling functions for tibe/vim, tibe/ccv, tibe/pmi:
# Tibetan in ACIP-JS encoding.
# To be included in wds-to-tlw

function smp_define_patterns(smp,sec)
  {
    # Checks that {sec} is a section this sample knows about.
    # The only accepted value is "tot.1"; anything else is reported
    # through {data_error} (defined elsewhere).
    # The {smp} argument is not used here.

    if (sec != "tot.1") { data_error(("invalid section \"" sec "\"")); }

    # This does not work unfortunately:
    #
    #   # Tibetan word patterns in ACIP-JS
    #   # Note that fix-word (below) replaces "GA-" by "GA°".
    #
    #   tibe_cons_pat = "(([KGTDPBCSZtd]|[D][Z])[H]?|[NMnJVWYRLH]|NG|NY|TS|TZ|sh|Ksh|['])";
    #   tibe_mid_vowel_pat = "([A][°]?|[IUEO]|EE|OO|[\'][AIU]|[RL][\']?[i])";
    #   tibe_init_vowel_pat = "([A][AIUEO]|AEE|AOO|A[\'][AIU]|[RL][\']?[i])";
    #   tibe_init_letter_pat = ( "(" tibe_cons_pat "|" tibe_init_vowel_pat ")" );
    #   tibe_mid_letter_pat = ( "(" tibe_cons_pat "|" tibe_mid_vowel_pat ")" );
    #   tibe_token_pat = ( "^" tibe_init_letter_pat tibe_mid_letter_pat "*$" );
    #
    # The equivalent pattern is therefore written out in full, as a
    # regex constant, inside {smp_is_good_token}.
  }

function smp_is_good_token(smp,sec,type,wd)
  {
    # Returns 1 if the whole of {wd} looks like a Tibetan word in
    # ACIP-JS encoding, 0 otherwise.
    # The arguments {smp}, {sec} and {type} are not used here.
    #
    # Tibetan word - guess...
    #
    # The regex is the expansion of the (disabled) patterns listed in
    # {smp_define_patterns}: one initial letter (a consonant or an
    # initial-vowel group) followed by zero or more medial letters
    # (a consonant or a medial-vowel group), anchored at both ends.

    return (wd ~ /^((([KGTDPBCSZtd]|[D][Z])[H]?|[NMnJVWYRLH]|NG|NY|TS|TZ|sh|Ksh|['])|([A][AIUEO]|AEE|AOO|A[\'][AIU]|[RL][\']?[i]))((([KGTDPBCSZtd]|[D][Z])[H]?|[NMnJVWYRLH]|NG|NY|TS|TZ|sh|Ksh|['])|([A][°]?|[IUEO]|EE|OO|[\'][AIU]|[RL][\']?[i]))*$/);
  }

function smp_reclassify_token(smp,sec,cursec,curlin,type,wd,   inside)
  {
    # Decides the final type of a token, given the section tag {cursec}
    # of the place where it occurs.  Returns "n" or "x" for tokens that
    # are rejected by the tests below, and the incoming {type} otherwise.
    # The arguments {smp}, {curlin} and the local {inside} are not
    # used here.

    # Delete all but ordinary book text
    if (sec == "tot.1")
      { if (cursec !~ /^{b}{f[0-9]+}/) { return "n"; } }
    else
      { # Report the section name that was actually tested.  (This used
        # to print an unset variable, so the message was always empty.)
        arg_error(("bad subsection \"" sec "\""));
      }

    # Delete titles:
    if (cursec ~ /{(tt|cn)}$/) { return "n"; }

    # Within the selected sections, reject anything that is not prose text
    if (cursec !~ /{f[0-9]+}$/) { return "x"; }

    # Discard punctuation other than parag breaks:
    if ((type == "p") && ( wd != "=")) { return "n"; }

    return type;
  }

function smp_fix_token(smp,sec,type,wd)
  {
    # Returns {wd} with its hyphens rewritten as described below.
    # The arguments {smp}, {sec} and {type} are not used here.
    #
    # Tibetan ACIP-JS encoding has mostly uppercase but uses
    # lowercase in some significant cases, so we preserve the case in this case.
    #
    # The hyphen "-" seems to be used to make compound foreign words,
    # but after "GA" it is usually an encoding device to disambiguate
    # letter parsing.  We map the latter to "°" (this should be done
    # in the source!) and delete the former.
    #
    # NOTE(review): the first substitution is anchored at the start of
    # the word, so only a leading "GA-" is mapped to "GA°" -- confirm
    # that a "GA-" in the middle of a word is not meant to be covered.
    # NOTE(review): the second substitution replaces each remaining "-"
    # by a newline rather than by the empty string, which does not match
    # the word "delete" above -- confirm which one is intended.

    gsub(/^GA[-]/, "GA°", wd);
    gsub(/[-]/, "\n", wd);
    return wd;
  }