#! /bin/gawk -f
# Last edited on 2012-05-05 19:53:44 by stolfilocal

BEGIN {
  # {abort} is negative while no error has been detected; the error
  # procedures set it to the desired exit status.
  abort = -1;
  # The "-v field=NUM" option is mandatory (see the check below),
  # so it is not shown in brackets.
  usage = ( "reencode-words-pinyin-for-tex \\\n" \
    "  -v field=NUM \\\n" \
    "  < INFILE.wct > OUTFILE.tex" \
  );

  # Assumes that field number {field} of each input record is a
  # word in Chinese pinyin encoding, with numeric tone suffixes,
  # possibly factored into letters by braces "{}",
  # and possibly followed by "." and a numeric disambiguating code.
  # Allows for both the "ü"/"ê" and the "u:"/"e^" conventions.
  # Adds `\' in front of some characters to allow typesetting 
  # in TeX (with proper fonts).

  if (field == "") { arg_error("must specify \"field\""); }
  # A non-numeric {field} would silently select the whole record,
  # and a negative one would cause a fatal gawk error on the first
  # data line; reject both here with a proper message.
  if ((field != int(field)) || (field < 0))
    { arg_error(("bad \"field\" = \"" field "\"")); }
}

# Once an error procedure has set {abort} to a non-negative status,
# stop processing records and exit with that status.
abort >= 0 { exit abort }

# Blank lines and "#"-comment lines (possibly indented with spaces)
# are copied to the output unchanged.
$0 ~ /^ *([#]|$)/ { print $0; next }

/./ { 
  # Data record: validates the word in field {field}, re-encodes it
  # for TeX, and prints the modified record.  Note that assigning to
  # the field makes awk rebuild the record with {OFS} between fields.
  if (NF < field) { data_error("bad NF"); }
  w = $(field);
  # The accented letters accepted here must include the ones that
  # {reencode_pinyin_for_tex} converts, namely "ü", "ê" and their
  # capitals.  The letters "ë"/"Ë" are still accepted for backward
  # compatibility, although the converter leaves them untouched.
  # Multibyte letters are written as alternatives, not inside the
  # bracket expression, so that they also match in byte-oriented locales.
  if (w !~ /^([-*{}a-zA-Z0-9?:^.]|ü|ê|ë|Ü|Ê|Ë)+$/)
    { data_error(("bad word \"" w "\"")); }
  w = reencode_pinyin_for_tex(w);
  $(field) = w;
  print; next;
}

function reencode_pinyin_for_tex(w)
{
  # Returns the pinyin word {w} re-encoded for typesetting with TeX:
  #   * characters that are special to TeX get a `\' prefix;
  #   * "ü", "u:", "ê", "e^" (and capitals) become TeX accent groups;
  #   * the trailing tone digit and/or ".NNN" disambiguating suffix
  #     are wrapped as "\tn{TONE}{SUFFIX}", with empty braces for
  #     whichever part is absent.

  # Protect special characters.  This must precede the accent
  # conversion, which introduces unescaped braces of its own:
  gsub(/[{]/, "\\{", w);
  gsub(/[}]/, "\\}", w);
  gsub(/[#]/, "\\#", w);  # Just in case
  # In a gsub() replacement, `\&' stands for a literal "&" with no
  # backslash, so an extra escaped backslash is needed to get "\&":
  gsub(/[&]/, "\\\\&", w);  # Just in case
  gsub(/[$]/, "\\$", w);  # Just in case
  gsub(/[%]/, "\\%", w);  # Just in case

  # Change special letters to TeX accents.  The non-ASCII letters are
  # written as plain literals rather than bracket expressions: in a
  # byte-oriented locale "[ü]" would match each byte of the multibyte
  # character separately and garble the result.
  gsub(/e\^/, "{\\^e}", w);
  gsub(/u:/, "{\\\"u}", w);
  gsub(/ü/, "{\\\"u}", w);
  gsub(/ê/, "{\\^e}", w);
  gsub(/E\^/, "{\\^E}", w);
  gsub(/U:/, "{\\\"U}", w);
  gsub(/Ü/, "{\\\"U}", w);
  gsub(/Ê/, "{\\^E}", w);

  # Mark tone and suffix for possible special processing.  At most
  # one of these substitutions takes effect: each of the first three
  # leaves the word ending in "}", which the later patterns reject.
  # NOTE(review): a word with neither tone nor suffix that ends in an
  # accent group or an escaped brace also ends in "}" and therefore
  # gets no "\tn{}{}" marker -- confirm whether that is intended.
  w = gensub(/([0-9])[.]([0-9]+)$/, "\\\\tn{\\1}{\\2}", "", w);
  w = gensub(/[.]([0-9]+)$/, "\\\\tn{}{\\1}", "", w);
  w = gensub(/([0-9])$/, "\\\\tn{\\1}{}", "", w);
  w = gensub(/([^{}0-9])$/, "\\1\\\\tn{}{}", "", w);
  return w;
}

function arg_error(msg)
{
  # Reports a command-line error: writes {msg} and the usage text to
  # the standard error stream, records the failure in {abort}, and
  # terminates the program with status 1.
  printf("%s\nusage: %s\n", msg, usage) > "/dev/stderr";
  abort = 1;
  exit 1;
}

function data_error(msg)
{
  # Reports an error in the input data: writes {msg}, prefixed with
  # the current record number {FNR}, to the standard error stream,
  # records the failure in {abort}, and terminates with status 1.
  abort = 1;
  printf("line %d: %s\n", FNR, msg) > "/dev/stderr";
  exit abort;
}