# Last edited on 2002-01-16 03:01:00 by stolfi

# Sampling functions for tibe/vim and tibe/ccv
# Tibetan in ACIP-JS encoding: the Vimalakirti Sutra and
# The Commentary on Commentary on Valid Reasoning.
#
# To be included in select-evt-lines, fix-raw-words, select-gud-bad-words

function select_evt_line(subsec,chapter,unit,linenum) {
  # Decides whether a line of the text should be selected.
  #
  # Text is in a single part; use "P" type units (running text).
  # Returns 1 if {unit} begins with "P", 0 otherwise.  The {chapter}
  # and {linenum} arguments are not used by this function.
  #
  # Any subsection other than "bod.1" is reported through {arg_error},
  # which is defined outside this file (presumably it aborts; confirm
  # against its definition).
  #
  # NOTE: the test must be a comparison ("=="); a plain "=" would
  # assign to {subsec}, always be true, and make the error branch
  # unreachable.
  #
  if (subsec == "bod.1") {
    return (unit ~ /^[P]/);
  } else {
    arg_error(("bad subsection \"" subsec "\""));
  }
}

function fix_raw_word(word) {
  # Cleans up a raw word and returns the result.
  #
  # Tibetan ACIP-JS encoding has mostly uppercase but some lowercase,
  # so we preserve the case in this case.
  #
  # The hyphen "-" seems to be used to make compound foreign words,
  # but after "GA" it is usually an encoding device to disambiguate
  # letter parsing.  Either remove it or break words at it.
  #
  # A hyphen right after a word-initial "GA" is simply dropped:
  gsub(/^GA[-]/, "GA", word);
  # Every remaining hyphen becomes a newline, splitting the word into
  # several; a word that is empty or made only of hyphens is left as is:
  if (word !~ /^[-]*$/) {
    gsub(/[-]/, "\n", word);
  }
  return word;
}

function define_patterns() {
  # Tibetan word patterns in ACIP-JS
  #
  # Sets the global variables {tibe_*_pat} (awk variables that are not
  # function parameters are global).  Only {tibe_word_pat} is used
  # elsewhere in this file, by {is_good_word}.
  #
  # Consonant letters:
  tibe_cons_pat = "(([KGTDPBCSZtd]|[D][Z])[H]?|[NMnJWYRLH]|NG|NY|TS|TZ|sh|Ksh|['])";
  # Vowels in non-initial and in initial position, respectively:
  tibe_mid_vowel_pat = "([AIUEO]|EE|OO|['][AIU]|[RL][']?[i])";
  tibe_init_vowel_pat = "([A][AIUEO]|AEE|AOO|A['][AIU]|[RL][']?[i])";
  # A letter is either a consonant or a vowel of the proper kind:
  tibe_init_letter_pat = ( "(" tibe_cons_pat "|" tibe_init_vowel_pat ")" );
  tibe_mid_letter_pat = ( "(" tibe_cons_pat "|" tibe_mid_vowel_pat ")" );
  # A whole word: one initial letter followed by zero or more mid letters.
  tibe_word_pat = ( "^" tibe_init_letter_pat tibe_mid_letter_pat "*$" );
}

function is_good_word(word) {
  # Tibetan word - guess...
  #
  # Returns 1 if {word} matches the global {tibe_word_pat}, 0 otherwise.
  # {define_patterns} must have been called first: an unset pattern is
  # the empty string, which as a regexp matches every word.
  #
  return ((word ~ tibe_word_pat));
}