#! /usr/bin/gawk -f
# Last edited on 2012-05-05 18:57:34 by stolfilocal
# Sampling functions for tibe/vim, tibe/ccv, tibe/pmi:
# Tibetan in ACIP-JS encoding. 
# To be included in wds-to-tlw

function smp_define_patterns(smp,sec)
{
  # Per-sample setup hook.  The only live work done here is validating
  # the section name: "tot.1" is the sole section accepted, and any
  # other value is reported through data_error().
  #
  #   smp  sample identifier (not used in this function).
  #   sec  section name to validate.
  #
  # Historical note: the intent was to build the Tibetan (ACIP-JS) word
  # pattern here from named pieces, as sketched below.  The original
  # author recorded that this does not work, so the fully expanded
  # pattern is written out literally in smp_is_good_token instead.
  # Note that smp_fix_token replaces a leading "GA-" by "GA°".
  #
  # # tibe_cons_pat = "(([KGTDPBCSZtd]|[D][Z])[H]?|[NMnJVWYRLH]|NG|NY|TS|TZ|sh|Ksh|['])";
  # # tibe_mid_vowel_pat = "([A][°]?|[IUEO]|EE|OO|[\'][AIU]|[RL][\']?[i])";
  # # tibe_init_vowel_pat = "([A][AIUEO]|AEE|AOO|A[\'][AIU]|[RL][\']?[i])";
  # # tibe_init_letter_pat = ( "(" tibe_cons_pat "|" tibe_init_vowel_pat ")" );
  # # tibe_mid_letter_pat = ( "(" tibe_cons_pat "|" tibe_mid_vowel_pat ")" );
  # # tibe_token_pat = ( "^" tibe_init_letter_pat tibe_mid_letter_pat "*$" );

  if (sec == "tot.1") { return; }

  data_error(("invalid section \"" sec "\""));
}

function smp_is_good_token(smp,sec,type,wd)
{
  # Returns 1 if {wd} looks like a Tibetan word in ACIP-JS encoding,
  # 0 otherwise.  This is only a guess based on letter shapes.
  #
  #   smp, sec, type  not used in this function.
  #   wd              the token text to test.
  #
  # The pattern requires the whole of {wd} to be one initial letter
  # followed by zero or more middle letters, where
  #
  #   consonant      = ([KGTDPBCSZtd]|DZ) with optional H, or one of
  #                    [NMnJVWYRLH], NG, NY, TS, TZ, sh, Ksh, or "'";
  #   initial vowel  = A[AIUEO], AEE, AOO, A'[AIU], or [RL] optional "'" i;
  #   middle vowel   = A with optional "°", [IUEO], EE, OO, '[AIU],
  #                    or [RL] optional "'" i;
  #   initial letter = consonant or initial vowel;
  #   middle letter  = consonant or middle vowel.
  #
  # The regex is spelled out in full as a literal; do not edit one half
  # without checking the matching consonant sub-pattern in the other.
  if (wd ~ /^((([KGTDPBCSZtd]|[D][Z])[H]?|[NMnJVWYRLH]|NG|NY|TS|TZ|sh|Ksh|['])|([A][AIUEO]|AEE|AOO|A[\'][AIU]|[RL][\']?[i]))((([KGTDPBCSZtd]|[D][Z])[H]?|[NMnJVWYRLH]|NG|NY|TS|TZ|sh|Ksh|['])|([A][°]?|[IUEO]|EE|OO|[\'][AIU]|[RL][\']?[i]))*$/)
    { return 1; }
  return 0;
}

function smp_reclassify_token(smp,sec,cursec,curlin,type,wd,  inside)
{
  # Decides the final type code of a token from its position in the
  # source, returning either {type} unchanged or an override code.
  #
  #   smp     sample identifier (not used in this function).
  #   sec     requested section; only "tot.1" is supported, anything
  #           else is reported through arg_error().
  #   cursec  section tag string of the current input position; it is
  #           matched below against tags of the form "{b}{fN}...".
  #   curlin  current line (not used in this function).
  #   type    incoming token type code; "p" denotes punctuation.
  #   wd      token text.
  #   inside  unused local, kept so the signature is unchanged.
  #
  # Returns "n" for tokens that are deleted, "x" for tokens inside the
  # selected sections that are rejected as non-prose, and {type} otherwise.
  # NOTE(review): the exact downstream meaning of "n" and "x" is defined
  # by the caller (wds-to-tlw) -- confirm there.

  # Delete all but ordinary book text
  if (sec == "tot.1")
    { if (cursec !~ /^{b}{f[0-9]+}/) { return "n"; } }
  else  
    { # Report the offending value: the message used to reference the
      # undefined name "subsec", which always printed as an empty string.
      arg_error(("bad subsection \"" sec "\"")); }
  
  # Delete titles:
  if (cursec ~ /{(tt|cn)}$/) { return "n"; }
  
  # Within the selected sections, reject anything that is not prose text
  # (i.e. whose innermost tag is not "{fN}"):
  if (cursec !~ /{f[0-9]+}$/) { return "x"; }
  
  # Discard punctuation other than parag breaks: 
  if ((type == "p") && ( wd != "=")) { return "n"; }
  
  return type;
}

function smp_fix_token(smp,sec,type,wd)
{
  # Normalizes the text of a token and returns the result.
  #
  #   smp, sec, type  not used in this function.
  #   wd              the token text to normalize.
  #
  # Tibetan ACIP-JS encoding is mostly uppercase but uses lowercase in
  # some significant cases, so the letter case is preserved here.
  #
  # The hyphen "-" seems to be used to make compound foreign words,
  # but after "GA" it is usually an encoding device to disambiguate
  # letter parsing.  A "GA-" at the start of the token is therefore
  # mapped to "GA°" (this should really be done in the source!).

  # The pattern is anchored at the start, so at most one match exists.
  sub(/^GA[-]/, "GA°", wd);

  # Every remaining hyphen is replaced by a newline character.
  # NOTE(review): the earlier comment said these hyphens are deleted;
  # the newline presumably splits the compound into separate tokens
  # on output -- confirm against the caller.
  if (index(wd, "-") > 0) { gsub(/[-]/, "\n", wd); }

  return wd;
}