#! /usr/bin/gawk -f
# Last edited on 2012-05-05 22:12:02 by stolfilocal

# Sampling functions for russ/ptt
# The "Pentateuch" in Russian (Synodal), lowercased, in KOI8-R encoding.

# To be included in wds-to-tlw

# Validates the section name {sec} and builds the global case-folding
# table {koi_tolow}.
#
#   smp  - sample name (not used by this function).
#   sec  - section name; must be one of "gen.1", "exo.1", "lev.1",
#          "num.1", "deu.1", "tot.1", otherwise {data_error} is called.
#   i, uc, lc - local scratch variables (not passed by callers).
#
# Sets the globals {koi_ucs}, {koi_lcs} and {koi_tolow}.
function smp_define_patterns(smp,sec,   i,uc,lc)
{
  if ((sec != "gen.1") &&
      (sec != "exo.1") &&
      (sec != "lev.1") &&
      (sec != "num.1") &&
      (sec != "deu.1") &&
      (sec != "tot.1")) {
    data_error(("invalid section \"" sec "\""));
  }

  # Parallel letter tables: the letter at position {i} of {koi_ucs} is
  # folded to the letter at position {i} of {koi_lcs}.
  # NOTE(review): as rendered in this copy, the glyphs of {koi_ucs} look
  # lowercase and those of {koi_lcs} look uppercase, the reverse of what the
  # variable names suggest.  Presumably this is an artifact of how the
  # KOI8-R bytes were transcoded for display; confirm against the raw file
  # before touching these literals.
  koi_ucs = "ёюабцдефгхийклмнопярстужвьызшэщчъ";
  koi_lcs = "ЁЮАБЦДЕФГХИЙКЛМНОПЯРСТУЖВЬЫЗШЭЩЧЪ";

  # A null separator makes gawk split the string into single characters.
  split(koi_ucs, uc, "");
  split(koi_lcs, lc, "");

  # Reset the table, then map every {uc} letter to its {lc} partner and
  # every {lc} letter to itself.
  split("", koi_tolow);
  for (i in uc) { koi_tolow[uc[i]] = lc[i]; }
  for (i in lc) { koi_tolow[lc[i]] = lc[i]; }

  # This does not work unfortunately:
  # koi_word_pat = ( "^[" koi_lcs "]+$" );
}

# Returns 1 if {wd} consists of one or more letters from the accepted
# (folded-case) alphabet and nothing else, 0 otherwise.
#
#   smp, sec, type - not used by this function.
#   wd             - the word to test.
function smp_is_good_word(smp,sec,type,wd)
{
  # Accept lowercase letters only.  The alternation is spelled out letter
  # by letter instead of using a bracket expression (see the remark about
  # {koi_word_pat} in {smp_define_patterns}).
  return (wd ~ /^(Ё|Ю|А|Б|Ц|Д|Е|Ф|Г|Х|И|Й|К|Л|М|Н|О|П|Я|Р|С|Т|У|Ж|В|Ь|Ы|З|Ш|Э|Щ|Ч|Ъ)+$/);
}

# Decides the final type code of a token from its location in the text.
#
#   smp     - sample name (not used by this function).
#   sec     - requested section ("gen.1" ... "deu.1", or "tot.1" for all
#             five books).
#   cursec  - location tag string of the current token; matched against
#             patterns such as "{b1}", "{tt}", "{cN}{vN}".
#   curlin  - current line (not used by this function).
#   type    - incoming type code of the token.
#   wd      - the token text.
#
# Returns "n" for tokens to be deleted, "x" for rejected material, and the
# unchanged {type} otherwise.  Calls {arg_error} if {sec} is not one of
# the known sections.
function smp_reclassify_word(smp,sec,cursec,curlin,type,wd)
{
  # Delete any text outside Book {sec}:
  if (sec == "gen.1") {
    if (cursec !~ /^{b1}/) { return "n"; }
  } else if (sec == "exo.1") {
    if (cursec !~ /^{b2}/) { return "n"; }
  } else if (sec == "lev.1") {
    if (cursec !~ /^{b3}/) { return "n"; }
  } else if (sec == "num.1") {
    if (cursec !~ /^{b4}/) { return "n"; }
  } else if (sec == "deu.1") {
    if (cursec !~ /^{b5}/) { return "n"; }
  } else if (sec == "tot.1") {
    if (cursec !~ /^{b[1-5]}/) { return "n"; }
  } else {
    # Report the offending section name (the parameter is {sec}).
    arg_error(("bad subsection \"" sec "\""));
  }

  # Delete titles:
  if (cursec ~ /{(tt|cn)}$/) { return "n"; }

  # Reject any material that is not verse text:
  if (cursec !~ /{c[1-9][0-9]*}{v[1-9][0-9]*}$/) { return "x"; }

  # Discard punctuation other than parag breaks:
  if ((type == "p") && (wd != "=")) { return "n"; }

  return type;
}

# Normalizes a word: folds case through {koi_tolow} and breaks the word at
# hyphens by turning each "-" into a blank.
#
#   smp, sec, type - not used by this function.
#   wd             - the word to normalize.
#   n, i, tw, c    - local scratch variables (not passed by callers).
#
# Returns the normalized word.  Characters that are neither in {koi_tolow}
# nor a hyphen are dropped.
function smp_fix_word(smp,sec,type,wd,   n,i,tw,c)
{
  # Map upper case to lower case, break at hyphens:
  # Must do the mapping by hand:
  n = length(wd);
  tw = "";
  for (i = 1; i <= n; i++) {
    c = substr(wd,i,1);
    if (c in koi_tolow) {
      tw = ( tw koi_tolow[c] );
    } else if (c == "-") {
      # Keep the hyphen so that it can be turned into a word break below;
      # looking it up in {koi_tolow} would silently delete it.
      tw = ( tw c );
    }
    # Any other unmapped character is dropped.  Testing with {in} avoids
    # creating empty entries in the global table as a side effect.
  }
  wd = tw;
  gsub(/[-]/, " ", wd);
  return wd;
}