#! /usr/bin/gawk -f
# Last edited on 2012-05-05 21:04:13 by stolfilocal
# Sampling functions for viet/ptt
# The "Pentateuch" in Vietnamese (Cadman), lowercased, in VIQR encoding.
# To be included in wds-to-tlw

# The Pentateuch source file has many hyphenated compounds, and many
# transliterated Hebrew nouns (mostly names but a few common ones as well).
# The latter can be distinguished by their capitalization;
# they are written as hyphenated compounds of Vietnamese-like syllables,
# like "A'ch-te^-ro't-Ca-na-im" for "Astaroth-Carnaim". 
#  
# For compatibility with Voynichese and the Bible in other languages,
# we must count proper names as words, after turning them into lower
# case. For compatibility with other languages, like Chinese, we break
# all hyphenated compounds (including the foreign names) into separate
# syllables.

function smp_define_patterns(smp,sec)
{
  # Checks that {sec} is one of the section names known to this sample
  # ("gen.1", "exo.1", "lev.1", "num.1", "deu.1", "tot.1"); any other
  # value is reported through {data_error}.  The {smp} parameter is not
  # used here, and no pattern variables are actually set (see below).

  if (! ((sec == "gen.1") || (sec == "exo.1") || (sec == "lev.1") ||
         (sec == "num.1") || (sec == "deu.1") || (sec == "tot.1")))
    { data_error(("invalid section \"" sec "\"")); }

  # The pattern definitions below are disabled because, per the original
  # author's note, they do not work; the equivalent literal regex is
  # written out in full in {smp_is_good_word} instead.
  #
  # # Some handy patterns (in the original VIQR encoding):
  # viqr_vowel_pat = "([a][\\(\\^]?|[o][\\+\\^]?|[u][\\+]?|[e][\\^]?|[iy])";
  # viqr_init_pat = "([d][d]?|[ckg][h]?|[n][g]?[h]?|[p][h]|[q]|[t][rh]?|[bfhjlmrsvx]|)";
  # viqr_fin_pat = "([c][h]?|[n][gh]?|[mpt]|)";
  # viqr_tone_pat = "([`'.\\?~]|)";
  # viqr_word_pat = ( \
  #   "^" viqr_init_pat \
  #   viqr_vowel_pat "+" viqr_tone_pat viqr_vowel_pat "*" \
  #   viqr_fin_pat "$" \
  # );
}

function smp_is_good_word(smp,sec,type,wd)
{ 
  # Returns 1 if {wd} is a single lowercase syllable in VIQR spelling,
  # 0 otherwise.  The accepted shape is, in order:
  #   an optional initial consonant (cluster),
  #   one or more vowels, each with an optional postfix modifier
  #     ("(" or "^" after "a", "+" or "^" after "o", "+" after "u",
  #     "^" after "e"),
  #   an optional postfix tone mark (one of ` ' . ? ~),
  #   zero or more further vowels,
  #   an optional final consonant (cluster).
  # The pattern contains no hyphen, so a word that still has a hyphen
  # in it is not accepted.  The {smp}, {sec} and {type} parameters are
  # not used here.
  if (wd ~ /^([d][d]?|[ckg][h]?|[n][g]?[h]?|[p][h]|[q]|[t][rh]?|[bfhjlmrsvx]|)([a][(^]?|[o][+^]?|[u][+]?|[e][\^]?|[iy])+([`'.?~]|)([a][(^]?|[o][+^]?|[u][+]?|[e][\^]?|[iy])*([c][h]?|[n][gh]?|[mpt]|)$/)
    { return 1; }
  else
    { return 0; }
}

function smp_reclassify_word(smp,sec,cursec,curlin,type,wd)
{
  # Decides the class of word {wd}, whose incoming class is {type} and
  # whose location in the text is the locator string {cursec}, for the
  # requested sample section {sec}.  Returns:
  #   "n"    if {cursec} is outside the book(s) selected by {sec}, if it
  #          ends with a "{tt}" or "{cn}" marker (titles), or if {type}
  #          is "p" and {wd} is not "=";
  #   "x"    if {cursec} is not exactly a book/chapter/verse locator
  #          of the form "{bN}{cN}{vN}";
  #   {type} unchanged otherwise.
  # An unrecognized {sec} is reported through {arg_error}.
  # The {smp} and {curlin} parameters are not used here.

  # Decide whether line is inside or outside section (Book) {sec}:
  if (sec == "gen.1")
    { if (cursec !~ /^{b1}/) { return "n"; } }
  else if (sec == "exo.1")
    { if (cursec !~ /^{b2}/) { return "n"; } }
  else if (sec == "lev.1")
    { if (cursec !~ /^{b3}/) { return "n"; } }
  else if (sec == "num.1")
    { if (cursec !~ /^{b4}/) { return "n"; } }
  else if (sec == "deu.1")
    { if (cursec !~ /^{b5}/) { return "n"; } }
  else if (sec == "tot.1")
    { if (cursec !~ /^{b[1-5]}/) { return "n"; } }
  else  
    { # Report the offending {sec} itself; the message text is otherwise
      # unchanged.
      arg_error(("bad subsection \"" sec "\""));
    }
  
  # Delete titles:
  if (cursec ~ /{(tt|cn)}$/) { return "n"; }
  
  # Reject any material that is not verse text:
  if (cursec !~ /^{b[1-9][0-9]*}{c[1-9][0-9]*}{v[1-9][0-9]*}$/) { return "x"; }

  # Discard punctuation other than parag breaks: 
  if ((type == "p") && (wd != "=")) { return "n"; }

  return type;
}

function smp_fix_word(smp,sec,type,wd)
{
  # Returns a cleaned-up copy of {wd}: folded to lower case, with
  # hyphens replaced by blanks (so that a hyphenated compound becomes
  # blank-separated pieces), and with the stand-in characters for the
  # VIQR diacritics "?", "(" and "." turned back into those diacritics.
  # The {smp}, {sec} and {type} parameters are not used here.

  # Map everything to lowercase:
  wd = tolower(wd);

  # Break at hyphens (independent of the substitutions below):
  gsub(/[-]/, " ", wd);

  # Any literal "?" still present becomes "*" (in case it was used for
  # bad chars).  This must happen BEFORE the stand-in for "?" is
  # restored, or the restored diacritics would be turned into "*" too.
  gsub(/[?]/, "*", wd);

  # Restore VIQR diacritics (order of these three is kept as-is):
  gsub(/[ß]/, "?", wd);
  gsub(/[µ]/, "(", wd);
  gsub(/[°]/, ".", wd);

  return wd;
}