#! /usr/bin/gawk -f
# Last edited on 2012-05-05 22:13:46 by stolfilocal

# Sampling functions for russ/pic
# Russian - "Piknik na obochine" by the Brothers Strugatsky
#
# To be included in wds-to-tlw

function smp_define_patterns(smp,sec)
  {
    # Sets up per-section patterns for sample {smp}, section {sec}.
    # No patterns are needed for this sample, so the function only
    # validates {sec}: the sole accepted section is "tot.1".
    # Anything else is reported through {data_error}, which is not
    # defined in this file (presumably supplied by the including
    # script -- confirm).
    if (sec != "tot.1")
      { data_error(("invalid section \"" sec "\"")); }
  }

function smp_is_good_word(smp,sec,type,wd)
  {
    # Returns 1 if {wd} is a well-formed word of this sample, 0 otherwise.
    # The {smp}, {sec} and {type} arguments are not used here.
    #
    # Accept only lowercase alpha, "é", plus apostrophe and "^".
    # Apostrophes and "^" can't be doubled or occur next to each other.
    # They cannot occur at the beginning (but one may end the word).
    # The "é" seems to occur only at the beginning in the text; the
    # pattern does not enforce that, it accepts "é" wherever a letter
    # is allowed.
    # The "°" can only occur at the end of the word.
    #
    # The "°" is wrapped in a group so that the "?" applies to the whole
    # character.  If gawk runs in a single-byte locale while this file is
    # UTF-8 encoded, a bare "°?" would make only the last byte optional
    # and the first byte mandatory, rejecting every ordinary word.
    return (wd ~ /^([a-z]|é)(['^]?([a-z]|é))*['^]?(°)?$/);
  }

function smp_reclassify_word(smp,sec,cursec,curlin,type,wd)
  {
    # Returns the (possibly changed) type code for word {wd}, based on
    # the current section tag {cursec} and the incoming {type}:
    #   "n"  -- the word is to be deleted;
    #   "x"  -- the word is rejected as not being Russian prose;
    #   else the incoming {type}, unchanged.
    # The {smp}, {sec} and {curlin} arguments are not used here.

    # Delete all but ordinary chapters (omit opening quote):
    # the section tag must start with "{b}{cN}", N a positive number.
    if (cursec !~ /^{b}{c[1-9][0-9]*}/) { return "n"; }

    # Delete titles (section tags ending in "{tt}" or "{cn}"):
    if (cursec ~ /{(tt|cn)}$/) { return "n"; }

    # Within the selected sections, reject anything that is not Russian prose
    # (i.e. whose section tag does not end in "{tx}"):
    if (cursec !~ /{tx}$/) { return "x"; }

    # Discard punctuation other than parag breaks ("="):
    if ((type == "p") && (wd != "=")) { return "n"; }

    return type;
  }

function smp_fix_word(smp,sec,type,wd)
  {
    # Returns a normalized copy of {wd}.  Awk passes scalars by value,
    # so the caller's string is not modified.
    # The {smp}, {sec} and {type} arguments are not used here.

    # Break at hyphens (each "-" becomes a blank, so the result may
    # contain several blank-separated pieces):
    gsub(/[-]/, " ", wd);

    # Map upper case to lower case:
    wd = tolower(wd);

    # Cannot trust that LOCALE crap: {tolower} may leave "É" untouched,
    # so map it explicitly.
    gsub(/É/, "é", wd);

    # The soft-sign "'" and hard-sign "^" are never word-initial
    # so they do not occur in uppercase.
    # (We assume that there are no all-caps words).
    return wd;
  }