#! /usr/bin/gawk -f
# Last edited on 2023-05-14 13:22:31 by stolfi

# Sampling functions for russ/ptt
# The "Pentateuch" in Russian (Synodal), lowercased, in KOI8-R encoding.
# To be included in wds-to-tlw

# ** MUST BE SAVED IN ISO-LATIN-1 **

# Validates the section name {sec} and builds the global case-folding
# table {koi_tolow}, which maps every letter of {koi_ucs} and of
# {koi_lcs} to the letter at the same position in {koi_lcs}.
#
# Also sets the globals {koi_ucs} and {koi_lcs}. Calls {data_error}
# (defined by the including script) when {sec} is not one of the
# recognized sections. The {smp} argument is not used here.
# The parameters {i,uc,lc} are local work variables.
function smp_define_patterns(smp,sec,   i,uc,lc) {
  if ((sec != "gen.1") &&
      (sec != "exo.1") &&
      (sec != "lev.1") &&
      (sec != "num.1") &&
      (sec != "deu.1") &&
      (sec != "tot.1")) {
    data_error(("invalid section \"" sec "\""));
  }

  # The letters are written here as the ISO Latin-1 characters that
  # (presumably) share their byte codes with the KOI8-R letters; that
  # is why this file must be saved in ISO Latin-1.
  koi_ucs = "³àáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ";
  koi_lcs = "£ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß";

  # Splitting with an empty separator yields one array element per character.
  split(koi_ucs, uc, "");
  split(koi_lcs, lc, "");

  # Clear the table, then map upper -> lower and lower -> itself.
  split("", koi_tolow);
  for (i in uc) { koi_tolow[uc[i]] = lc[i]; }
  for (i in lc) { koi_tolow[lc[i]] = lc[i]; }

  # This does not work unfortunately:
  # koi_word_pat = ( "^[" koi_lcs "]+$" );
}

# Returns true (1) iff {wd} is non-empty and consists only of the
# lowercase letters listed in the pattern below; otherwise returns 0.
# The arguments {smp}, {sec} and {type} are not used.
function smp_is_good_word(smp,sec,type,wd) {
  # Accept lowercase letters only.  Each letter is its own bracket
  # expression joined by alternation; see the note in
  # {smp_define_patterns} about the single-class pattern not working.
  return (wd ~ /^([£]|[À]|[Á]|[Â]|[Ã]|[Ä]|[Å]|[Æ]|[Ç]|[È]|[É]|[Ê]|[Ë]|[Ì]|[Í]|[Î]|[Ï]|[Ð]|[Ñ]|[Ò]|[Ó]|[Ô]|[Õ]|[Ö]|[×]|[Ø]|[Ù]|[Ú]|[Û]|[Ü]|[Ý]|[Þ]|[ß])+$/);
}

# Decides the final type of a word {wd} of tentative type {type}, given
# the requested section {sec} and the current location tag {cursec}.
#
# Returns "n" for text outside the requested book, for titles, and for
# punctuation other than "="; returns "x" for anything else that is not
# inside a chapter/verse location; otherwise returns {type} unchanged.
# Calls {arg_error} (defined by the including script) if {sec} is not a
# recognized section.  The arguments {smp} and {curlin} are not used.
function smp_reclassify_word(smp,sec,cursec,curlin,type,wd) {
  # Delete any text outside Book {sec}:
  # printf " ## %s/%s {%s}{%s} %s:[%s]\n", smp,sec,cursec,curlin,type,wd > "/dev/stderr";
  if (sec == "gen.1") {
    if (cursec !~ /^{b1}/) { return "n"; }
  } else if (sec == "exo.1") {
    if (cursec !~ /^{b2}/) { return "n"; }
  } else if (sec == "lev.1") {
    if (cursec !~ /^{b3}/) { return "n"; }
  } else if (sec == "num.1") {
    if (cursec !~ /^{b4}/) { return "n"; }
  } else if (sec == "deu.1") {
    if (cursec !~ /^{b5}/) { return "n"; }
  } else if (sec == "tot.1") {
    if (cursec !~ /^{b[1-5]}/) { return "n"; }
  } else {
    # Report the offending section name itself (the original code
    # interpolated an undefined variable here, printing an empty name).
    arg_error(("bad subsection \"" sec "\""));
  }

  # Delete titles:
  if (cursec ~ /{(tt|cn)}$/) { return "n"; }

  # Reject any material that is not verse text:
  if (cursec !~ /{c[1-9][0-9]*}{v[1-9][0-9]*}$/) { return "x"; }

  # Discard punctuation other than parag breaks:
  if ((type == "p") && (wd != "=")) { return "n"; }

  return type;
}

# Returns {wd} with every letter found in {koi_tolow} replaced by its
# lowercase form and every hyphen replaced by a blank (so a hyphenated
# word comes back as blank-separated pieces).
#
# Characters that have no entry in {koi_tolow} are copied unchanged.
# Without that, the lookup would yield an empty string for them and the
# hyphens would vanish before they could be turned into blanks.
# Requires {smp_define_patterns} to have been called to fill
# {koi_tolow}.  The arguments {smp}, {sec} and {type} are not used; the
# parameters {n,i,ch,tw} are local work variables.
function smp_fix_word(smp,sec,type,wd,   n,i,ch,tw) {
  # Map upper case to lower case, break at hyphens:
  # Must do the mapping by hand:
  n = length(wd);
  tw = "";
  for (i = 1; i <= n; i++) {
    ch = substr(wd, i, 1);
    # The {in} test avoids creating empty table entries as a side effect.
    tw = ( tw ((ch in koi_tolow) ? koi_tolow[ch] : ch) );
  }
  gsub(/[-]/, " ", tw);
  return tw;
}