#! /usr/bin/gawk -f
# Last edited on 2012-05-05 21:03:20 by stolfilocal
# Sampling functions for viep/mky
# Pseudo-Vietnamese generated by an order-3 Markov monkey
# To be included in wds-to-tlw

# The input is already in lowercase, in modified VIQR encoding.

function smp_define_patterns(smp,sec)
{
  # Per-section setup for this sample.  The only section accepted is
  # "tot.1"; any other name is reported through data_error().
  # No pattern variables are actually set: the attempt to assemble the
  # word pattern from string pieces (kept below for reference) did not
  # work, so smp_is_good_word() uses the pattern as a regex constant.
  # The {smp} argument is not used.

  # This does not work, unfortunately:
  # # Some handy patterns (in the original VIQR encoding):
  # viqr_vowel_pat = "([a][\\(\\^]?|[o][\\+\\^]?|[u][\\+]?|[e][\\^]?|[iy])";
  # viqr_init_pat = "([d][d]?|[ckg][h]?|[n][g]?[h]?|[p][h]|[q]|[t][rh]?|[bfhjlmrsvx]|)";
  # viqr_fin_pat = "([c][h]?|[n][gh]?|[mpt]|)";
  # viqr_tone_pat = "([`\'.\\?~]|)";
  # viqr_word_pat = ( \
  #   "^" viqr_init_pat \
  #   viqr_vowel_pat "+" viqr_tone_pat viqr_vowel_pat "*" \
  #   viqr_fin_pat "$" \
  # );
  # 
  # Bare pattern (initial, vowels, tone mark, more vowels, final):
  # ([d][d]?|[ckg][h]?|[n][g]?[h]?|[p][h]|[q]|[t][rh]?|[bfhjlmrsvx]|)
  # ([a][(^]?|[o][+^]?|[u][+]?|[e][^]?|[iy])+ 
  # ([`'.?~]|)
  # ([a][(^]?|[o][+^]?|[u][+]?|[e][^]?|[iy])*
  # ([c][h]?|[n][gh]?|[mpt]|)

  # Known section: nothing to do.
  if (sec == "tot.1") { return; }

  data_error(("invalid section \"" sec "\""));
}

function smp_is_good_word(smp,sec,type,wd)
{
  # Returns 1 if {wd} is a single well-formed syllable in (modified) VIQR
  # spelling, 0 otherwise.  The whole word must consist of, in order:
  #   an optional initial consonant cluster,
  #   one or more vowels, each with an optional postfix diacritic,
  #   an optional tone mark,
  #   zero or more further vowels,
  #   an optional final consonant cluster.
  # Only lowercase letters and the VIQR marks are matched.
  # The {smp}, {sec} and {type} arguments are not used.
  if (wd ~ /^([d][d]?|[ckg][h]?|[n][g]?[h]?|[p][h]|[q]|[t][rh]?|[bfhjlmrsvx]|)([a][(^]?|[o][+^]?|[u][+]?|[e][\^]?|[iy])+([`'.?~]|)([a][(^]?|[o][+^]?|[u][+]?|[e][\^]?|[iy])*([c][h]?|[n][gh]?|[mpt]|)$/)
    { return 1; }
  return 0;
}

function smp_reclassify_word(smp,sec,cursec,curlin,type,wd)
{
  # Decides the final type code of a word given where it occurs.
  #   sec     requested section; only "tot.1" is supported, anything
  #           else is reported through arg_error().
  #   cursec  section locator of the current line; it is tested against
  #           the {b1}{cNNN} prefix and the {vNNN} suffix.
  #   type    type code assigned to the word so far.
  #   wd      the word itself.
  # Returns "n" if the line lies outside the section or the word is
  # punctuation other than "=", "x" if the line is not verse text, and
  # {type} unchanged otherwise.
  # The {smp} and {curlin} arguments are not used.

  # Decide whether line is inside or outside section (Book) {sec}:
  if (sec == "tot.1")
    { if (cursec !~ /^{b1}{c[0-9]+}/) { return "n"; } }
  else
    { # Report the offending section name itself ({sec}); the original
      # code used an undefined variable here and printed an empty name.
      arg_error(("bad subsection \"" sec "\""));
    }

  # Reject any material that is not "verse" text:
  if (cursec !~ /{v[0-9]+}$/) { return "x"; }

  # Discard punctuation other than parag breaks: 
  if ((type == "p") && (wd != "=")) { return "n"; }

  return type;
}

function smp_fix_word(smp,sec,type,wd)
{
  # Normalizes one word: maps upper case to lower case, restores the
  # VIQR diacritics that were replaced by placeholder characters, and
  # breaks the word at hyphens.
  # Returns the modified {wd}; it contains a blank wherever the input
  # had a hyphen.  The {smp}, {sec} and {type} arguments are not used.

  # Map everything to lowercase:
  wd = tolower(wd);

  # Map "?" to "*" in case it was used for bad chars.
  # This must precede the next step, which produces genuine "?" marks.
  gsub(/[?]/, "*", wd);

  # Restore VIQR diacritics.  The placeholders are matched as plain
  # literals, not bracket expressions: when gawk matches bytewise (C
  # locale or -b) a multi-byte character inside [...] is taken as a set
  # of separate bytes, and each byte would be replaced on its own.
  gsub(/ß/, "?", wd);
  gsub(/µ/, "(", wd);
  gsub(/°/, ".", wd);

  # Break at hyphens:
  gsub(/[-]/, " ", wd);
  return wd;
}