# Last edited on 2002-01-16 03:02:02 by stolfi
# Sampling functions for viet/ptt
# The "Pentateuch" in Vietnamese (Cadman), lowercased, in VIQR encoding.
# To be included in select-evt-lines, fix-raw-words, select-gud-bad-words

function select_evt_line(subsec,chapter,unit,linenum)
{
  # Decides whether one line of the sample should be selected.
  #   subsec  - subsection tag; only "bod.1" is defined for this sample.
  #   chapter - chapter tag; must begin with one of the letters "a".."e".
  #   unit    - unit tag; must begin with "P".
  #   linenum - line number; not used by this sample.
  # Returns 1 if the line is selected, 0 otherwise.  Any other
  # subsection is reported through arg_error().

  # This must be a comparison ("=="): an assignment here is always
  # true, so an unknown subsection would be silently treated as "bod.1"
  # and the error branch below could never run.
  if (subsec == "bod.1")
    { 
      # Consider any part, running text (unit type "P"):
      return ((chapter ~ /^[abcde]/) && (unit ~ /^[P]/));
    }
  else
    { arg_error(("bad subsection \"" subsec "\"")); }
}

function fix_raw_word(word)
{
  # Normalizes one raw word: folds it to lower case, turns the
  # placeholder characters back into VIQR diacritic marks, and splits
  # hyphenated compounds into newline-separated pieces.
  # Returns the normalized string.

  word = tolower(word);

  # A literal "?" may have been used for a bad character; turn it into
  # "*" first so it is not confused with the "?" mark restored below.
  gsub(/[?]/, "*", word);

  # Placeholder character -> VIQR diacritic mark.
  gsub(/[ß]/, "?", word);
  gsub(/[µ]/, "(", word);
  gsub(/[°]/, ".", word);

  # A word that is empty or consists only of hyphens is returned unsplit.
  if (word ~ /^[-]*$/) { return word; }

  # Otherwise every hyphen becomes a line break.
  gsub(/[-]/, "\n", word);
  return word;
}

function define_patterns()
{
  # Sets the global regular-expression strings used to validate words:
  #   viqr_vowel_pat  - one vowel, optionally followed by its shape mark;
  #   viqr_letter_pat - a vowel with an optional tone mark, or a consonant;
  #   viqr_word_pat   - anchored pattern for one or more such letters.

  # Vowel alternatives, assembled one per statement.
  viqr_vowel_pat = "(";
  viqr_vowel_pat = viqr_vowel_pat "([a][\\(\\^]?)";     # a, a(, a^
  viqr_vowel_pat = viqr_vowel_pat "|([o][\\+\\^]?)";    # o, o+, o^
  viqr_vowel_pat = viqr_vowel_pat "|([u][\\+]?)";       # u, u+
  viqr_vowel_pat = viqr_vowel_pat "|([e][\\^]?)";       # e, e^
  viqr_vowel_pat = viqr_vowel_pat "|[iy]";              # i, y
  viqr_vowel_pat = viqr_vowel_pat ")";

  # A letter is a vowel with at most one of the marks ` ' . ? ~ after
  # it, or any lowercase letter other than a, e, i, o, u, y.
  viqr_letter_pat = "((" viqr_vowel_pat "[`'.\\?~]?)";
  viqr_letter_pat = viqr_letter_pat "|[b-df-hj-np-tvwxz])";

  # A word is a non-empty run of letters and nothing else.
  viqr_word_pat = "^" viqr_letter_pat "+$";

  # Alternative to use instead if hyphenated words are OK:
  # viqr_word_pat = ( "^" viqr_letter_pat "([-]?" viqr_letter_pat ")*$" );
}

function is_good_word(word)
{ 
  # Returns 1 when `word` matches the global pattern viqr_word_pat,
  # and 0 otherwise.  With the pattern that define_patterns() sets by
  # default, that means lowercase letters with postfix diacritics only;
  # hyphens are accepted only if the hyphen-tolerant variant of the
  # pattern is enabled there.
  if (word ~ viqr_word_pat) { return 1; }
  return 0;
}