#! /usr/bin/gawk -f
# Last edited on 2012-05-05 21:04:01 by stolfilocal
# Sampling functions for viet/ptt
# The Catholic Vietnamese Gospels, lowercased, in VIQR encoding.
# To be included in wds-to-tlw

# The Gospels source file has many hyphenated compounds, and many
# transliterated Hebrew nouns (mostly names but a few common ones as well).
# The latter can be distinguished by their capitalization, 
# and are written WITHOUT hyphenation, e.g. "Ye^su Kito^" for 
# "Jesus Christ". 
# 
# For compatibility with Voynichese and the Bible in other languages,
# we must count proper names as words, after turning them into lower
# case. For compatibility with other languages, like Chinese, we break
# all hyphenated compounds (including foreign names) into separate syllables.
# 
# The spelling conventions used in the Catholic Gospels are quite
# different from those used in the Pentateuch, e.g. "Ysaac" versus
# "Y-sa'c", "Ye^rico^" versus "Gie^-ri-co^". Perhaps the difference is
# due to the pre-translation language (Vulgate versus King James?), or
# to the Portuguese roots of the Catholic Church in Vietnam. Anyway,
# in order to make the two files compatible, we assume that the word-map
# table {wmap} reencodes and hyphenates all capitalized Gospel names
# according to the Pentateuch conventions (i.e. maps "Ye^rico^" to
# "Gie^-ri-co^", etc.)

function smp_define_patterns(smp,sec)
{
  # Validates the section tag {sec}. The accepted tags are "mat.1",
  # "mrk.1", "luk.1", "jhn.1" and "tot.1"; any other value is reported
  # through {data_error}. The {smp} argument is not used here.
  # No pattern variables are actually defined (see the note below).

  # Nothing to do for a recognized section:
  if ((sec == "mat.1") || (sec == "mrk.1") || (sec == "luk.1")) { return; }
  if ((sec == "jhn.1") || (sec == "tot.1")) { return; }

  data_error(("invalid section \"" sec "\""));

  # This does not work, unfortunately:
  # # Some handy patterns (in the original VIQR encoding):
  # viqr_vowel_pat = "([a][\\(\\^]?|[o][\\+\\^]?|[u][\\+]?|[e][\\^]?|[iy])";
  # viqr_init_pat = "([d][d]?|[ckg][h]?|[n][g]?[h]?|[p][h]|[q]|[t][rh]?|[bfhjlmrsvx]|)";
  # viqr_fin_pat = "([c][h]?|[n][gh]?|[mpt]|)";
  # viqr_tone_pat = "([`\'.\\?~]|)";
  # viqr_word_pat = ( \
  #   "^" viqr_init_pat \
  #   viqr_vowel_pat "+" viqr_tone_pat viqr_vowel_pat "*" \
  #   viqr_fin_pat "$" \
  # );
}

function smp_is_good_word(smp,sec,type,wd)
{
  # Returns 1 if {wd} is a single well-formed lowercase syllable in
  # VIQR encoding, and 0 otherwise. The arguments {smp}, {sec} and
  # {type} are not consulted.
  #
  # The whole of {wd} must consist of, in this order:
  #   an optional initial consonant (d, dd, c, ch, k, kh, g, gh,
  #     n, ng, nh, ngh, ph, q, t, tr, th, or one of b f h j l m r s v x);
  #   one or more vowels (a, a(, a^, o, o+, o^, u, u+, e, e^, i, y);
  #   an optional tone mark (one of ` ' . ? ~);
  #   zero or more further vowels;
  #   an optional final consonant (c, ch, n, ng, nh, m, p, t).
  #
  # Note that hyphens, blanks and upper-case letters are NOT accepted
  # by this pattern.
  if (wd ~ /^([d][d]?|[ckg][h]?|[n][g]?[h]?|[p][h]|[q]|[t][rh]?|[bfhjlmrsvx]|)([a][(^]?|[o][+^]?|[u][+]?|[e][\^]?|[iy])+([`'.?~]|)([a][(^]?|[o][+^]?|[u][+]?|[e][\^]?|[iy])*([c][h]?|[n][gh]?|[mpt]|)$/)
    { return 1; }
  return 0;
}

function smp_reclassify_word(smp,sec,cursec,curlin,type,wd)
{
  # Decides the final type code of the token {wd}, whose current type
  # is {type}, given the sample section {sec} and the locator {cursec}
  # of the line where the token occurs.
  #
  # Returns "n" when the token lies outside the book selected by {sec},
  # when {cursec} ends with a "{tt}" or "{cn}" tag, or when it is a
  # type "p" token other than "="; returns "x" when {cursec} is not of
  # the form "{bN}{cN}{vN}"; otherwise returns {type} unchanged.
  # An unrecognized {sec} is reported through {arg_error}.
  # The arguments {smp} and {curlin} are not used.

  # Decide whether the line is inside or outside section (Book) {sec}:
  if (sec == "mat.1")
    { if (cursec !~ /^{b1}/) { return "n"; } }
  else if (sec == "mrk.1")
    { if (cursec !~ /^{b2}/) { return "n"; } }
  else if (sec == "luk.1")
    { if (cursec !~ /^{b3}/) { return "n"; } }
  else if (sec == "jhn.1")
    { if (cursec !~ /^{b4}/) { return "n"; } }
  else if (sec == "tot.1")
    { if (cursec !~ /^{b[1-5]}/) { return "n"; } }
  else
    { # Report the offending section tag itself (the argument is {sec}).
      arg_error(("bad subsection \"" sec "\""));
    }

  # Delete titles:
  if (cursec ~ /{(tt|cn)}$/) { return "n"; }

  # Reject any material that is not verse text
  # (locator must be exactly book, chapter, verse):
  if (cursec !~ /^{b[1-9][0-9]*}{c[1-9][0-9]*}{v[1-9][0-9]*}$/) { return "x"; }

  # Discard punctuation other than parag breaks:
  if ((type == "p") && (wd != "=")) { return "n"; }

  return type;
}

function smp_fix_word(smp,sec,type,wd)
{
  # Normalizes the token {wd} and returns the result: applies the
  # global word-map table {wmap}, maps upper case to lower case,
  # restores VIQR diacritics, and breaks the word at hyphens.
  # The returned string may therefore contain several blank-separated
  # pieces. The arguments {smp}, {sec} and {type} are not used.

  # Apply word-map table to fix spelling of foreign
  # names and hyphenate them (lookup uses the original capitalization):
  if (wd in wmap) { wd = wmap[wd]; }

  # Map everything to lower case:
  wd = tolower(wd);

  # Map "?" to "*" in case it was used for bad chars.
  # This must precede the next step, which reintroduces "?":
  gsub(/[?]/, "*", wd);

  # Restore VIQR diacritics. The stand-in characters are matched as
  # plain literals rather than inside brackets: in a byte-oriented
  # locale a bracket expression holding a multibyte character would
  # match each of its bytes separately and garble the word.
  gsub(/ß/, "?", wd);
  gsub(/µ/, "(", wd);
  gsub(/°/, ".", wd);

  # Break at hyphens:
  gsub(/-/, " ", wd);

  return wd;
}