#! /usr/bin/gawk -f
# Last edited on 2012-05-05 19:49:35 by stolfilocal
# Sampling functions for chin/red
# Chinese - Dream of the Red Mansion in GB code,
# converted to uniquified pinyin.
# To be included in wds-to-tlw

# These functions assume that the word table {wmap} maps each GB
# byte-pair that corresponds to a Chinese character to a uniquified
# pinyin; GB punctuation codes to strings in guillemets («...»); and
# symbols (stars, non-Chinese letters, etc.) to strings in parentheses.

function smp_define_patterns(smp,sec)
{
  # Sets up the global string {gb_bytes} used by {smp_is_GB_code}.
  # The only section accepted is "tot.1"; anything else is reported
  # through {data_error}.

  if (sec != "tot.1") { data_error(("invalid section \"" sec "\"")); }

  # Characters accepted as either byte of a GB byte-pair, assembled
  # in three consecutive runs:
  gb_bytes = "¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿";
  gb_bytes = (gb_bytes "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞ");
  gb_bytes = (gb_bytes "ßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþ");

  # The string-built patterns below do not work, unfortunately, and
  # are kept only for reference:
  #
  # # Pattern for Chinese GB codes (just in case):
  # # gb_code_pat = ( "^[" gb_bytes "][" gb_bytes "]$" );
  #
  # # Patterns for lowercase pinyin syllable
  # # with numeric tone and disambiguating suffix:
  # pinyin_initial_pat = "([csz][h]|[bcdfghjklmnpqrstwxyz])?";
  # pinyin_middle_pat = "([aeiouüê]|a[io]|ei|i[aeou]|iao|ou|üe|u[aeio]|uai)";
  # pinyin_final_pat = "([n][g]?|[r])?";
  # pinyin_tone_pat = "[0-9]";
  # pinyin_dupl_pat = "([.][1-9][0-9]*)?";
  # pinyin_word_pat = ( \
  #   "^" \
  #   pinyin_initial_pat pinyin_middle_pat pinyin_final_pat \
  #   pinyin_tone_pat pinyin_dupl_pat \
  #   "$" \
  # );
}

function smp_is_good_word(smp,sec,type,wd)
{
  # A word is good if it is either a raw GB byte-pair or a pinyin
  # syllable with numeric tone and optional disambiguating suffix.
  # Returns 1 if good, 0 otherwise.
  if (smp_is_GB_code(wd)) { return 1; }
  return (smp_is_pinyin_word(wd) ? 1 : 0);
}

function smp_is_pinyin_word(wd)
{
  # Returns 1 if the whole of {wd} is a lowercase pinyin syllable with
  # a numeric tone and an optional disambiguating suffix, else 0.
  #
  # The pattern is, in order:
  #   optional initial:  "ch", "sh", "zh" or a single consonant;
  #   vowel nucleus:     single vowel (incl. "ü", "ê") or a listed
  #                      diphthong/triphthong;
  #   optional final:    "n", "ng" or "r";
  #   tone:              one digit;
  #   optional suffix:   "." followed by a positive integer.
  #
  # The pattern is anchored at BOTH ends: without the leading "^" any
  # string that merely ends in a syllable (e.g. "!!ma3") would pass.
  # "ü" and "ê" are kept as separate alternatives, outside the
  # bracket expression, as in the original pattern.
  return \
    (wd ~ /^([csz][h]|[bcdfghjklmnpqrstwxyz])?([aeiou]|ü|ê|a[io]|ei|i[aeou]|iao|ou|üe|u[aeio]|uai)([n][g]?|[r])?[0-9]([.][1-9][0-9]*)?$/);
}

function smp_is_GB_code(wd)
{
  # Returns 1 if {wd} is exactly two characters long and both of them
  # occur in the global string {gb_bytes}; returns 0 otherwise.
  if (length(wd) != 2) { return 0; }
  if (index(gb_bytes, substr(wd,1,1)) <= 0) { return 0; }
  return (index(gb_bytes, substr(wd,2,1)) > 0);
}

function smp_reclassify_word(smp,sec,cursec,curlin,type,wd)
{
  # Decides the final type code of word {wd}, whose tentative type is
  # {type}, found in the section whose locator is {cursec}.
  #
  # Returns:
  #   "n"  for words outside section {sec}, words in titles, and
  #        punctuation other than the paragraph break "=";
  #   "x"  for material that is not prose text;
  #   otherwise the (possibly revised) type: a word of type "a" whose
  #   {wmap} entry is wrapped in guillemets becomes "p", and one whose
  #   entry is wrapped in parentheses becomes "s".
  #
  # Calls {arg_error} if {sec} is not a known section, and {data_error}
  # if a word of type "a" is not a GB byte-pair.  {type} and {wd} are
  # modified only locally.

  # Delete anything outside section {sec}:
  if (sec == "tot.1")
    { if (cursec !~ /^{b[12]}/) { return "n"; } }
  else  
    { # Report the offending section name {sec} itself.
      arg_error(("bad subsection \"" sec "\"")); }
  
  # Delete titles:
  if (cursec ~ /{(tt|cn)}$/) { return "n"; }
  
  # Reject any material that is not prose text:
  if (cursec !~ /{c[1-9][0-9]*}{tx}$/) { return "x"; }

  if (type == "a")
    { # The word should be actually a GB byte-pair.
      if (! smp_is_GB_code(wd)) { data_error(("malformed GB code \"" wd "\"")); } 
      if (wd in wmap) 
        { # Classify by the shape of the mapped string:
          wd = wmap[wd];
          if (wd ~ /^[«].+[»]$/) 
            { type = "p"; }
          else if (wd ~ /^[(].+[)]$/)
            { type = "s"; }
        }
      # If the GB code is not in the table, assume it is alpha.
   }

  # Discard punctuation other than parag breaks: 
  if ((type == "p") && (wd != "=")) { return "n"; }

  return type;
}

function smp_fix_word(smp,sec,type,wd)
{
  # Returns the cleaned-up form of {wd}: the word is first translated
  # through {wmap} (when it has an entry); if the result starts with
  # an ASCII letter it is taken to be pinyin, and is lowercased with
  # "u:" re-encoded as "ü" and "e^" as "ê".  Anything else is
  # returned as is.

  # Translate through the word-map table, if possible:
  wd = ((wd in wmap) ? wmap[wd] : wd);

  # Non-alphabetic results need no further work:
  if (wd !~ /^[a-zA-Z]/) { return wd; }

  # Chinese character in pinyin: normalize case and diacritics.
  wd = tolower(wd);
  gsub(/[u][:]/, "ü", wd);
  gsub(/[e][\^]/, "ê", wd);
  return wd;
}