#! /usr/bin/gawk -f
# Last edited on 2012-05-05 19:49:25 by stolfilocal
# Sampling functions for chin/ptt
# Chinese - Pentateuch (Union) in GB encoding,
# converted to uniquified pinyin.
# To be included in wds-to-tlw

# These functions assume that the word table {wmap} maps each GB
# byte-pair that corresponds to a Chinese character to a uniquified
# pinyin; GB punctuation codes to strings in guillemets; and symbols
# (stars, non-Chinese letters, etc.) to strings in parentheses.

function smp_define_patterns(smp,sec)
{
  # Checks that {sec} is one of the six section names known to this
  # sample, and defines the global string {gb_bytes} that is used by
  # {smp_is_GB_code} to test characters for membership.
  #
  # The parameter {smp} is not used in this function.  The function
  # {data_error} is not defined in this file.

  # Complain if {sec} is none of the recognized section names:
  if ( \
    (sec != "gen.1") && (sec != "exo.1") && (sec != "lev.1") && 
    (sec != "num.1") && (sec != "deu.1") && (sec != "tot.1") \
  ) 
    { data_error(("invalid section \"" sec "\"")); }

  # Global table of the characters accepted as either half of a GB
  # byte-pair. NOTE(review): these look like the high bytes of the
  # original single-byte encoding shown after a transcoding -- confirm
  # the file encoding before editing this literal.
  gb_bytes = ( \
    "ĄĒĢĪĨĶ§ĻĐŠŦŽŪŊ°ąēģīĩķ·ļđšŧž―ūŋ" \
    "ĀÁÂÃÄÅÆĮČÉĘËĖÍÎÏÐŅŌÓÔÕÖŨØŲÚÛÜÝÞ" \
    "ßāáâãäåæįčéęëėíîïðņōóôõöũøųúûüýþ" \
  );
  
  # The dynamically built patterns below are kept for reference only.
  # This does not work, unfortunately:
  # Pattern for Chinese GB codes (just in case):
  # gb_code_pat = ( "^[" gb_bytes "][" gb_bytes "]$" );
  # 
  # # Patterns for lowercase pinyin syllable 
  # # with numeric tone and disambiguating suffix:
  # pinyin_initial_pat = "([csz][h]|[bcdfghjklmnpqrstwxyz])?";
  # pinyin_middle_pat = "([aeiou\ü\ę]|a[io]|ei|i[aeou]|iao|ou|\üe|u[aeio]|uai)";
  # pinyin_final_pat = "([n][g]?|[r])?";
  # pinyin_tone_pat = "[0-9]";
  # pinyin_dupl_pat = "([.][1-9][0-9]*)?";
  # pinyin_word_pat = ( \
  #   "^" \
  #   pinyin_initial_pat pinyin_middle_pat pinyin_final_pat \
  #   pinyin_tone_pat pinyin_dupl_pat \
  #   "$" \
  # );
}

function smp_is_good_word(smp,sec,type,wd)
{
  # Returns 1 if {wd} is acceptable, 0 otherwise.  A word is acceptable
  # when it is a raw GB code, or a pinyin word with numeric tone and
  # optional disambiguating suffix.  The parameters {smp}, {sec} and
  # {type} are not consulted.

  # A raw GB byte-pair is accepted without looking further:
  if (smp_is_GB_code(wd)) { return 1; }

  # Otherwise the word must look like a pinyin syllable:
  return (smp_is_pinyin_word(wd) ? 1 : 0);
}

function smp_is_pinyin_word(wd)
{
  # Returns 1 if the whole of {wd} is a lowercase pinyin syllable with
  # numeric tone and optional disambiguating suffix, 0 otherwise.
  #
  # The syllable consists of, in order:
  #   an optional initial consonant ("ch", "sh", "zh", or one letter);
  #   a vowel nucleus (single vowel, "ü", "ę", or a listed diphthong
  #     or triphthong);
  #   an optional final "n", "ng", or "r";
  #   one tone digit;
  #   an optional suffix "." followed by a positive integer.
  #
  # The pattern is anchored at both ends, so that strings that merely
  # END with a syllable (such as "!!a1") are rejected.  The non-ASCII
  # vowels are written as alternatives rather than inside a bracket
  # expression, so that they are matched as literal character sequences.
  return \
    (wd ~ /^([csz][h]|[bcdfghjklmnpqrstwxyz])?([aeiou]|ü|ę|a[io]|ei|i[aeou]|iao|ou|üe|u[aeio]|uai)([n][g]?|[r])?[0-9]([.][1-9][0-9]*)?$/);
}

function smp_is_GB_code(wd)
{
  # Returns 1 if {wd} is exactly two characters long and each of them
  # occurs in the global string {gb_bytes}; returns 0 otherwise.

  # Wrong length: cannot be a byte-pair.
  if (length(wd) != 2) { return 0; }

  # First character must be listed in {gb_bytes}:
  if (index(gb_bytes, substr(wd,1,1)) == 0) { return 0; }

  # The verdict now depends only on the second character:
  return (index(gb_bytes, substr(wd,2,1)) > 0);
}

function smp_reclassify_word(smp,sec,cursec,curlin,type,wd)
{
  # Returns the (possibly revised) type code of the word {wd}, given
  # its original {type}, the requested sample section {sec}, and the
  # location string {cursec} of the text where the word occurs.
  #
  # Returns "n" for words outside the requested book, words in titles,
  # and punctuation other than "="; returns "x" for anything that is
  # not in a chapter/verse location.  Otherwise returns {type}, except
  # that a type "a" word whose {wmap} entry is a string in guillemets
  # or in parentheses becomes "p" or "s", respectively.
  #
  # The parameters {smp} and {curlin} are not used here.  Since awk
  # passes scalars by value, the table lookup applied to {wd} below is
  # local to this function and is not seen by the caller.

  # Delete anything outside Book {sec}:
  if (sec == "gen.1")
    { if (cursec !~ /^{b1}/) { return "n"; } }
  else if (sec == "exo.1")
    { if (cursec !~ /^{b2}/) { return "n"; } }
  else if (sec == "lev.1")
    { if (cursec !~ /^{b3}/) { return "n"; } }
  else if (sec == "num.1")
    { if (cursec !~ /^{b4}/) { return "n"; } }
  else if (sec == "deu.1")
    { if (cursec !~ /^{b5}/) { return "n"; } }
  else if (sec == "tot.1")
    { if (cursec !~ /^{b[1-5]}/) { return "n"; } }
  else  
    { # Report the offending section name itself:
      arg_error(("bad subsection \"" sec "\""));
    }
  
  # Delete titles:
  if (cursec ~ /{(tt|cn)}$/) { return "n"; }
  
  # Reject any material that is not verse text:
  if (cursec !~ /{c[1-9][0-9]*}{v[1-9][0-9]*}$/) { return "x"; }

  if (type == "a")
    { # The word should be actually a GB byte-pair.
      if (! smp_is_GB_code(wd)) { data_error(("malformed GB code \"" wd "\"")); } 
      if (wd in wmap) 
        { # Look at the table entry to tell punctuation and symbols
          # apart from ordinary characters:
          wd = wmap[wd];
          if (wd ~ /^[Ŧ].+[ŧ]$/) 
            { type = "p"; }
          else if (wd ~ /^[(].+[)]$/)
            { type = "s"; }
        }
      # If the GB code is not in the table, assume it is alpha.
    }

  # Discard punctuation other than (non-GB) parag breaks: 
  if ((type == "p") && (wd != "=")) { return "n"; }

  return type;
}

function smp_fix_word(smp,sec,type,wd)
{
  # Returns the cleaned-up form of {wd}: its entry in the global table
  # {wmap} if it has one, and, when the result begins with an ASCII
  # letter, that result folded to lowercase with "u:" and "e^"
  # replaced by single accented letters.  The parameters {smp}, {sec}
  # and {type} are not consulted.

  # Substitute the table entry, if there is one:
  wd = ((wd in wmap) ? wmap[wd] : wd);

  # Anything that does not start with a letter is returned as it is:
  if (wd !~ /^[a-zA-Z]/) { return wd; }

  # Chinese character in pinyin: fold case, then
  # reencode "u:" to "ü" and "e^" to "ę".
  wd = tolower(wd);
  gsub(/[u][:]/, "ü", wd);
  gsub(/[e][\^]/, "ę", wd);
  return wd;
}