#! /usr/bin/gawk -f
# Last edited on 2023-05-14 13:22:19 by stolfi

# Sampling functions for russ/ptr
# The "Pentateuch" in Russian (Synodal), lowercased, romanized.
# To be included in wds-to-tlw

# ** MUST BE SAVED IN ISO-LATIN-1 **

@include "./koi8r_to_latin.gawk"

# Validates the section name {sec} and (re)builds the global
# case-folding table {koi_tolow} used by {smp_fix_word}.
#
#   smp   - sample name (not used by this function).
#   sec   - section name; must be one of "gen.1", "exo.1", "lev.1",
#           "num.1", "deu.1" or "tot.1", otherwise {data_error}
#           (defined elsewhere) is called.
#   i, uc, lc - local work variables; callers must not pass them.
#
# Side effects: sets the globals {koi_ucs}, {koi_lcs} and {koi_tolow}.
function smp_define_patterns(smp,sec,   i,uc,lc)
{
    if ( \
        (sec != "gen.1") && (sec != "exo.1") && (sec != "lev.1") && \
        (sec != "num.1") && (sec != "deu.1") && (sec != "tot.1") \
    ) {
        data_error(("invalid section \"" sec "\""));
    }

    # Parallel strings: character k of {koi_ucs} folds to character k
    # of {koi_lcs}.  Presumably these are the KOI8-R codes of the
    # upper-case and lower-case Cyrillic letters, which is why the
    # file must stay in a single-byte encoding -- TODO confirm.
    koi_ucs = "³àáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ";
    koi_lcs = "£ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß";

    # Splitting on the empty string yields one array element per character.
    split(koi_ucs, uc, "");
    split(koi_lcs, lc, "");

    # Start from an empty table, then map every upper-case letter to its
    # lower-case partner and every lower-case letter to itself.
    split("", koi_tolow);
    for (i in uc) { koi_tolow[uc[i]] = lc[i]; }
    for (i in lc) { koi_tolow[lc[i]] = lc[i]; }

    # This does not work unfortunately:
    # koi_word_pat = ( "^[" koi_lcs "]+$" );
}

# Returns true (1) iff {wd} is a non-empty string consisting only of
# ASCII lower-case letters, apostrophes and double quotes; false (0)
# otherwise.  The parameters {smp}, {sec} and {type} are not used here.
function smp_is_good_word(smp,sec,type,wd)
{
    # Accept lowercase letters plus the ' and " marks only:
    return (wd ~ /^[a-z'"]+$/);
}

# Decides the final type code of a token, given where it occurs.
#
#   smp    - sample name (not used by this function).
#   sec    - section name, as accepted by {smp_define_patterns}.
#   cursec - location tag string of the token, matched against patterns
#            such as "{b1}...{c3}{v7}".
#   curlin - current line (not used by this function).
#   type   - incoming type code of the token; "p" marks punctuation.
#   wd     - the token itself.
#
# Returns "n" for tokens to be deleted (wrong book, titles, punctuation
# other than "="), "x" for material that is not verse text, and the
# incoming {type} unchanged otherwise.  Calls {arg_error} (defined
# elsewhere) when {sec} is not a known section.
function smp_reclassify_word(smp,sec,cursec,curlin,type,wd)
{
    # Delete any text outside Book {sec}:
    if (sec == "gen.1") {
        if (cursec !~ /^{b1}/) { return "n"; }
    } else if (sec == "exo.1") {
        if (cursec !~ /^{b2}/) { return "n"; }
    } else if (sec == "lev.1") {
        if (cursec !~ /^{b3}/) { return "n"; }
    } else if (sec == "num.1") {
        if (cursec !~ /^{b4}/) { return "n"; }
    } else if (sec == "deu.1") {
        if (cursec !~ /^{b5}/) { return "n"; }
    } else if (sec == "tot.1") {
        if (cursec !~ /^{b[1-5]}/) { return "n"; }
    } else {
        # Report the offending section name ({sec}); the message was
        # previously built from an undefined variable and came out empty.
        arg_error(("bad subsection \"" sec "\""));
    }

    # Delete titles:
    if (cursec ~ /{(tt|cn)}$/) { return "n"; }

    # Reject any material that is not verse text:
    if (cursec !~ /{c[1-9][0-9]*}{v[1-9][0-9]*}$/) { return "x"; }

    # Discard punctuation other than parag breaks:
    if ((type == "p") && (wd != "=")) { return "n"; }

    return type;
}

# Normalizes a word: folds it to lower case through {koi_tolow}, turns
# hyphens into blanks (so a hyphenated word becomes several words), and
# romanizes the result with {koi8r_to_latin} (from the included file).
#
#   smp, sec, type - not used by this function.
#   wd             - the word to normalize.
#   n, i, tw, ch   - local work variables; callers must not pass them.
#
# Requires {smp_define_patterns} to have filled {koi_tolow} beforehand.
# Characters that are neither in {koi_tolow} nor hyphens are dropped.
function smp_fix_word(smp,sec,type,wd,   n,i,tw,ch)
{
    # Uppercase to lowercase - must do letter by letter:
    n = length(wd);
    tw = "";
    for (i = 1; i <= n; i++) {
        ch = substr(wd,i,1);
        if (ch in koi_tolow) {
            # Membership test avoids creating empty table entries.
            tw = ( tw koi_tolow[ch] );
        } else if (ch == "-") {
            # Keep hyphens so that the break step below can see them;
            # a plain table lookup would silently delete them.
            tw = ( tw ch );
        }
    }
    wd = tw;

    # Break at hyphens:
    gsub(/[-]/, " ", wd);

    # Convert to romanization:
    wd = koi8r_to_latin(wd);
    return wd;
}