#! /usr/bin/gawk -f
# Last edited on 2012-05-05 19:50:09 by stolfilocal
# Sampling functions for chin/voa
# Chinese - Voice of America newscasts in pinyin.
# To be included in wds-to-tlw

function smp_define_patterns(smp,sec)
{
  # Validates the section name {sec} for this sample.  The only
  # section accepted is "tot.1"; any other value is reported
  # through {data_error}.  No pattern variables are set here.
  if (sec == "tot.1") { return; }
  data_error(("invalid section \"" sec "\""));

  # Disabled sketch (kept for reference; it does not work, unfortunately).
  # The idea was to assemble the pattern for a lowercase pinyin syllable
  # with numeric tone and disambiguating suffix out of named pieces:
  #
  #   pinyin_initial_pat = "([csz][h]|[bcdfghjklmnpqrstwxyz])?";
  #   pinyin_middle_pat = "([aeiouüê]|a[io]|ei|i[aeou]|iao|ou|üe|u[aeio]|uai)";
  #   pinyin_final_pat = "([n][g]?|[r])?";
  #   pinyin_tone_pat = "[0-9]";
  #   pinyin_token_pat = ( \
  #     "^" \
  #     pinyin_initial_pat pinyin_middle_pat pinyin_final_pat \
  #     pinyin_tone_pat \
  #     "$" \
  #   );
}

function smp_is_good_token(smp,sec,type,wd)
{
  # A token is "good" exactly when {smp_is_pinyin_token} accepts {wd}.
  # The arguments {smp}, {sec} and {type} are not consulted.
  return (smp_is_pinyin_token(wd));
}

function smp_is_pinyin_token(wd)
{
  # Returns 1 if the whole of {wd} is a single lowercase pinyin syllable
  # with a numeric tone and an optional disambiguating suffix; 0 otherwise.
  #
  # The pattern consists of, in order:
  #   an optional initial consonant ("ch", "sh", "zh", or a single letter);
  #   a vowel nucleus (single vowel, "ü", "ê", or a diphthong/triphthong);
  #   an optional final "n", "ng" or "r";
  #   one tone digit [0-9];
  #   an optional suffix "." followed by a positive integer without
  #   leading zeros.
  #
  # The pattern is anchored at BOTH ends.  Without the leading "^" any
  # string that merely ended in a syllable (e.g. "!!ma1") was accepted.
  return \
    (wd ~ /^([csz][h]|[bcdfghjklmnpqrstwxyz])?([aeiou]|ü|ê|a[io]|ei|i[aeou]|iao|ou|üe|u[aeio]|uai)([n][g]?|[r])?[0-9]([.][1-9][0-9]*)?$/)
}

function smp_reclassify_token(smp,sec,cursec,curlin,type,wd)
{
  # Decides the final type of token {wd}, of original type {type},
  # found in the input section whose tag is {cursec}, when extracting
  # section {sec} of the sample.  Returns:
  #   "n"     if the token is to be deleted (outside the requested
  #           section, inside a title, or punctuation other than "=");
  #   "x"     if the token lies in material that is not prose text;
  #   {type}  unchanged, otherwise.
  # The arguments {smp}, {curlin} are not consulted.

  # Delete anything outside section {sec}:
  if (sec == "tot.1")
    { if (cursec !~ /^{s[1-9][0-9]*}/) { return "n"; } }
  else
    { # Report the offending value of {sec} itself.
      arg_error(("bad subsection \"" sec "\""));
    }

  # Delete titles (section tag ends with "{tt}" or "{cn}"):
  if (cursec ~ /{(tt|cn)}$/) { return "n"; }

  # Reject any material that is not prose text, i.e. whose tag is
  # anything more than a single "{sNNN}" element:
  if (cursec !~ /^{s[1-9][0-9]*}$/) { return "x"; }

  # Discard punctuation other than parag breaks ("="):
  if ((type == "p") && (wd != "=")) { return "n"; }

  return type;
}

function smp_fix_token(smp,sec,type,wd)
{
  # Normalizes the spelling of token {wd} and returns the result.
  #
  # The word table is assumed to map each GB byte-pair that stands
  # for a Chinese character to a uniquified pinyin string; GB codes
  # of punctuation to strings in guillemets; and all other GB codes
  # (symbols, non-Chinese letters, etc.) to strings in parentheses.
  # Hence only tokens that begin with an ASCII letter are pinyin;
  # everything else is handed back untouched.
  if (wd !~ /^[a-zA-Z]/) { return wd; }

  # Fold to lowercase first, so the substitutions below also
  # catch "U:" and "E^":
  wd = tolower(wd);

  # Remap the ASCII digraphs "e^" and "u:" to "ê" and "ü":
  gsub(/[e][\^]/, "ê", wd);
  gsub(/[u][:]/, "ü", wd);

  # Break at hyphens (each one becomes a blank):
  gsub(/[-]/, " ", wd);

  return wd;
}