# Last edited on 2002-01-16 03:01:00 by stolfi
# Sampling functions for tibe/vim and tibe/ccv
# Tibetan in ACIP-JS encoding: the Vimalakirti Sutra and 
# The Commentary on Commentary on Valid Reasoning. 
# To be included in select-evt-lines, fix-raw-words, select-gud-bad-words

function select_evt_line(subsec,chapter,unit,linenum)
{
  # Decides whether a line belongs to the sample for subsection {subsec}.
  # Returns 1 if the line should be selected, 0 otherwise.
  #
  # Only subsection "bod.1" is recognized: the text is in a single part,
  # and only "P" type units (running text) are used, i.e. those whose
  # {unit} string begins with "P".
  #
  # Any other {subsec} is reported through {arg_error} (defined outside
  # this file; presumably it aborts the program -- confirm).
  #
  # The {chapter} and {linenum} arguments are not used here; they are
  # kept so that the signature stays the same for callers.
  #
  # NOTE: the test must be the comparison "==".  With a single "=" the
  # condition is an assignment of a non-empty string, which is always
  # true, so invalid subsections would be accepted silently.
  #
  if (subsec == "bod.1")
    { 
      return (unit ~ /^[P]/);
    }
  else
    { arg_error(("bad subsection \"" subsec "\"")); }
}

function fix_raw_word(word)
{
  # Cleans up a raw word and returns the result.  A returned string
  # may contain newlines, each one marking a point where the word
  # was broken at a hyphen.
  #
  # Tibetan ACIP-JS encoding has mostly uppercase but some lowercase,
  # so the case of the letters is preserved here.
  # 
  # The hyphen "-" seems to be used to make compound foreign words,
  # but after "GA" it is usually an encoding device to disambiguate
  # letter parsing.  So it is either removed or used as a break point.
  #

  # A hyphen right after a word-initial "GA" is simply dropped:
  gsub(/^GA[-]/, "GA", word);

  # A word that is empty or consists only of hyphens is left alone:
  if (word ~ /^[-]*$/) { return word; }

  # Every remaining hyphen becomes a word break (newline):
  gsub(/[-]/, "\n", word);
  return word;
}

function define_patterns()
{
  # Sets the global regular-expression strings that describe
  # Tibetan words in the ACIP-JS encoding.  Takes no arguments and
  # returns nothing; its only effect is to assign the {tibe_*_pat}
  # global variables below.

  # Vowel groups, as they may appear inside a word and at its start:
  tibe_mid_vowel_pat = "([AIUEO]|EE|OO|['][AIU]|[RL][']?[i])";
  tibe_init_vowel_pat = "([A][AIUEO]|AEE|AOO|A['][AIU]|[RL][']?[i])";

  # Consonant groups (same pattern in every position):
  tibe_cons_pat = "(([KGTDPBCSZtd]|[D][Z])[H]?|[NMnJWYRLH]|NG|NY|TS|TZ|sh|Ksh|['])";

  # A "letter" is either a consonant group or a vowel group:
  tibe_init_letter_pat = sprintf("(%s|%s)", tibe_cons_pat, tibe_init_vowel_pat);
  tibe_mid_letter_pat = sprintf("(%s|%s)", tibe_cons_pat, tibe_mid_vowel_pat);

  # A whole word is one initial letter followed by zero or more
  # middle letters, anchored at both ends:
  tibe_word_pat = sprintf("^%s%s*$", tibe_init_letter_pat, tibe_mid_letter_pat);
}

function is_good_word(word)
{ 
  # Guesses whether {word} looks like a Tibetan word: returns 1 if
  # the whole of {word} matches the global pattern {tibe_word_pat},
  # and 0 otherwise.
  #
  # The pattern is the one assigned by {define_patterns}.  If it has
  # not been set, the dynamic regexp is the empty string, which
  # matches any word.
  #
  if (word ~ tibe_word_pat) { return 1; }
  return 0;
}