#! /usr/bin/gawk -f
# Last edited on 2012-05-05 22:45:07 by stolfilocal

# Sampling functions for geez/gok
# Ge'ez (classic Ethiopian) - "Glory of the Kings",
# in the standard SERA encoding.

# To be included in wds-to-tlw

function smp_define_patterns(smp,sec)
{
  # This sample defines no patterns.  The only work done here is to
  # validate {sec}: "tot.1" is the single section accepted; any other
  # value is reported through {data_error} (provided by the including
  # program).  The {smp} argument is not used.
  if (sec == "tot.1") { return; }
  data_error(("invalid section \"" sec "\""));
}

function smp_is_good_word(smp,sec,type,wd)
{
  # Returns 1 if {wd} is a non-empty sequence of ASCII letters (upper
  # or lower case), where each letter may be preceded by at most one
  # apostrophe or backquote (used as character prefixes only);
  # returns 0 otherwise.  Only {wd} is examined.
  if (wd ~ /^([`']?[a-zA-Z])+$/) { return 1; }
  return 0;
}

function smp_reclassify_word(smp,sec,cursec,curlin,type,wd)
{
  # Returns the (possibly revised) type code of word {wd}, decided
  # from the current section tag {cursec}, the incoming {type}, and
  # {wd} itself.  The {smp}, {sec} and {curlin} arguments are not used.
  # The tests below are applied in order; the first that fires wins.

  # Result "n": delete material that is not in an ordinary chapter of
  # the book proper ("{b}{cN}{tx}") or in the introduction
  # ("{i}{p}{tx}"), and delete titles (section tag ending in "{tt}"
  # or "{cn}").
  if ((cursec !~ /^({b}{c[1-9][0-9]*}|{i}{p}){tx}/) || (cursec ~ /{(tt|cn)}$/)) {
    return "n";
  }

  # Result "x": inside the selected sections, reject anything whose
  # section tag does not end in "{tx}", i.e. that is not prose text.
  if (cursec !~ /{tx}$/) {
    return "x";
  }

  # Words that are not punctuation (type other than "p"), and the
  # paragraph break "=", keep the type they came in with.
  if ((type != "p") || (wd == "=")) {
    return type;
  }

  # Result "n": all remaining punctuation is discarded.
  return "n";
}

function smp_fix_word(smp,sec,type,wd)
{
  # The input is already in SERA, so the word needs no special
  # processing and is returned unchanged.
  return wd;
}