# Last edited on 2002-01-16 03:02:02 by stolfi

# Sampling functions for viet/ptt
# The "Pentateuch" in Vietnamese (Cadman), lowercased, in VIQR encoding.
#
# To be included in select-evt-lines, fix-raw-words, select-gud-bad-words

function select_evt_line(subsec,chapter,unit,linenum)
{
  # Decides whether a text line belongs to the sample.
  #
  #   subsec   subsection name; only "bod.1" is recognized here.
  #   chapter  chapter tag; accepted when it begins with one of "a".."e".
  #   unit     unit type tag; accepted when it begins with "P"
  #            (running text).
  #   linenum  not used by this sample.
  #
  # Returns 1 if the line is selected, 0 otherwise.  Any other
  # subsection is reported through arg_error(), which is defined by
  # the including script (presumably it aborts -- confirm there).

  # Consider any part, running text (unit type "P").
  # The comparison must be "==": a single "=" would assign "bod.1" to
  # subsec, make the test always true, and leave the error branch dead.
  if (subsec == "bod.1")
    { return ((chapter ~ /^[abcde]/) && (unit ~ /^[P]/)); }
  else
    { arg_error(("bad subsection \"" subsec "\"")); }
}

function fix_raw_word(word)
{
  # Normalizes one raw word and returns the result:
  #   1. maps upper case to lower case;
  #   2. turns every literal "?" into "*";
  #   3. turns the stand-in characters back into VIQR diacritics;
  #   4. breaks the word at hyphens, by replacing each "-" with a
  #      newline.
  # The returned string may therefore hold several newline-separated
  # pieces.

  word = tolower(word);

  # Map "?" to "*" in case it was used for bad chars.  This has to
  # happen before the next step, which produces "?" as a genuine
  # diacritic.
  gsub(/[?]/, "*", word);

  # Restore VIQR diacritics.  Presumably the raw text carries these
  # three stand-in characters in place of "?", "(" and "." -- verify
  # against the step that produces the raw words.
  gsub(/[ß]/, "?", word);
  gsub(/[µ]/, "(", word);
  gsub(/[°]/, ".", word);

  # Break at hyphens, except when the word is empty or consists of
  # hyphens only; such a word is returned unchanged.
  if (word !~ /^[-]*$/) { gsub(/[-]/, "\n", word); }

  return word;
}

function define_patterns()
{
  # Builds the regular expressions used by is_good_word() and stores
  # them in the global variables viqr_vowel_pat, viqr_letter_pat and
  # viqr_word_pat.  It must be called before is_good_word(): an unset
  # viqr_word_pat is the empty pattern, which matches every word.

  # A vowel letter with its optional postfix modifier:
  #   "a" + optional "(" or "^",  "o" + optional "+" or "^",
  #   "u" + optional "+",         "e" + optional "^",
  #   or a bare "i" / "y".
  viqr_vowel_pat = "(([a][\\(\\^]?)|([o][\\+\\^]?)|([u][\\+]?)|([e][\\^]?)|[iy])";

  # A letter: either a vowel followed by at most one of the
  # diacritic characters  ` ' . ? ~  or any lowercase letter
  # other than a, e, i, o, u, y.
  viqr_letter_pat = ( "((" viqr_vowel_pat "[`'.\\?~]?)|[b-df-hj-np-tvwxz])" );

  # A word: one or more letters and nothing else.
  viqr_word_pat = ( "^" viqr_letter_pat "+$" );

  # The following applies if hyphenated words are OK:
  # viqr_word_pat = ( "^" viqr_letter_pat "([-]?" viqr_letter_pat ")*$" );
}

function is_good_word(word)
{
  # Returns 1 if the whole of word matches viqr_word_pat, else 0.
  #
  # With the pattern currently set by define_patterns(), that means
  # a non-empty string of lowercase letters with postfix diacritics.
  # Hyphens are NOT accepted unless the alternative (commented-out)
  # definition of viqr_word_pat is enabled in define_patterns().
  return (word ~ viqr_word_pat);
}