#! /usr/bin/gawk -f
# Last edited on 2012-05-05 22:12:02 by stolfilocal

# Sampling functions for russ/ptt
# The "Pentateuch" in Russian (Synodal), lowercased, in KOI8-R encoding.
# To be included in wds-to-tlw
#
# This file calls data_error() and arg_error(), which are not defined
# here; they are expected to be supplied by the including program.

# smp_define_patterns(smp, sec)
#   Validates the section name {sec} and (re)builds the global
#   case-folding table {koi_tolow}.
#
#   smp  - sample name; not used by this function.
#   sec  - section name; must be one of "gen.1", "exo.1", "lev.1",
#          "num.1", "deu.1", "tot.1", otherwise data_error() is called.
#   i, uc, lc - local work variables; callers must not pass them.
#
#   Sets the globals {koi_ucs}, {koi_lcs} (the two parallel letter
#   strings) and {koi_tolow} (maps every letter of either string to the
#   letter at the same position in {koi_lcs}).
function smp_define_patterns(smp,sec,   i,uc,lc) {
  if ((sec != "gen.1") && (sec != "exo.1") && (sec != "lev.1") &&
      (sec != "num.1") && (sec != "deu.1") && (sec != "tot.1")) {
    data_error(("invalid section \"" sec "\""));
  }

  # Parallel strings: the i-th character of {koi_ucs} folds to the i-th
  # character of {koi_lcs}.
  # NOTE(review): the per-character split below is only correct if gawk
  # treats each of these letters as a single character (i.e. the locale
  # matches the file's encoding) - confirm against the caller's setup.
  koi_ucs = "£ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß";
  koi_lcs = "³àáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ";

  # An empty separator makes gawk split the string into single characters.
  split(koi_ucs, uc, "");
  split(koi_lcs, lc, "");

  # Clear the table, then map upper -> lower and lower -> itself.
  split("", koi_tolow);
  for (i in uc) { koi_tolow[uc[i]] = lc[i]; }
  for (i in lc) { koi_tolow[lc[i]] = lc[i]; }

  # This does not work unfortunately:
  # koi_word_pat = ( "^[" koi_lcs "]+$" );
}

# smp_is_good_word(smp, sec, type, wd)
#   Returns 1 if {wd} is a non-empty string made only of the letters
#   listed in the pattern below (the same letters as {koi_lcs}), else 0.
#   The parameters {smp}, {sec} and {type} are not used here.
function smp_is_good_word(smp,sec,type,wd) {
  # Accept lowercase letters only.  An explicit alternation is used
  # because the bracket-expression form was found not to work (see the
  # remark at the end of smp_define_patterns).
  return (wd ~ /^(³|à|á|â|ã|ä|å|æ|ç|è|é|ê|ë|ì|í|î|ï|ð|ñ|ò|ó|ô|õ|ö|÷|ø|ù|ú|û|ü|ý|þ|ÿ)+$/);
}

# smp_reclassify_word(smp, sec, cursec, curlin, type, wd)
#   Decides the type code of word {wd} from the locator {cursec} of the
#   text unit it occurs in.
#
#   sec    - section name selecting which book(s) to keep.
#   cursec - locator string of the current text unit, built from
#            "{...}" tags such as {b1}, {tt}, {cn}, {c12}, {v3}.
#   type   - incoming type code of the word; "p" marks punctuation.
#   smp, curlin - not used here.
#
#   Returns "n" for words to be deleted (wrong book, titles, punctuation
#   other than "="), "x" for words to be rejected (not verse text), and
#   the unchanged {type} otherwise.  Calls arg_error() if {sec} is not a
#   known section name.
#
#   NOTE(review): the patterns below rely on gawk matching "{" and "}"
#   literally when they do not form a valid interval expression - confirm
#   for the gawk version in use.
function smp_reclassify_word(smp,sec,cursec,curlin,type,wd) {
  # Delete any text outside Book {sec}:
  if (sec == "gen.1") {
    if (cursec !~ /^{b1}/) { return "n"; }
  } else if (sec == "exo.1") {
    if (cursec !~ /^{b2}/) { return "n"; }
  } else if (sec == "lev.1") {
    if (cursec !~ /^{b3}/) { return "n"; }
  } else if (sec == "num.1") {
    if (cursec !~ /^{b4}/) { return "n"; }
  } else if (sec == "deu.1") {
    if (cursec !~ /^{b5}/) { return "n"; }
  } else if (sec == "tot.1") {
    if (cursec !~ /^{b[1-5]}/) { return "n"; }
  } else {
    # Report the offending section name itself (the message used to be
    # built from an undefined variable and therefore showed an empty name).
    arg_error(("bad subsection \"" sec "\""));
  }

  # Delete titles:
  if (cursec ~ /{(tt|cn)}$/) { return "n"; }

  # Reject any material that is not verse text:
  if (cursec !~ /{c[1-9][0-9]*}{v[1-9][0-9]*}$/) { return "x"; }

  # Discard punctuation other than parag breaks:
  if ((type == "p") && (wd != "=")) { return "n"; }

  return type;
}

# smp_fix_word(smp, sec, type, wd)
#   Returns {wd} with every letter folded through {koi_tolow} and every
#   hyphen replaced by a single space, so that a hyphenated word comes
#   back as several space-separated words.  Any other character that has
#   no entry in {koi_tolow} is dropped.
#
#   smp, sec, type - not used here.
#   n, i, ch, tw   - local work variables; callers must not pass them.
#
#   Requires smp_define_patterns() to have been called first, since it
#   is what fills {koi_tolow}.
function smp_fix_word(smp,sec,type,wd,   n,i,ch,tw) {
  # Map upper case to lower case, break at hyphens.
  # Must do the mapping by hand:
  n = length(wd);
  tw = "";
  for (i = 1; i <= n; i++) {
    ch = substr(wd, i, 1);
    if (ch == "-") {
      # Hyphens have no entry in the table, so they must be handled
      # here; otherwise they would vanish before they could be turned
      # into word breaks.
      tw = (tw " ");
    } else if (ch in koi_tolow) {
      # Membership test avoids creating empty entries in the global table.
      tw = (tw koi_tolow[ch]);
    }
  }
  return tw;
}