#! /bin/gawk -f
# Last edited on 2004-02-18 10:38:26 by stolfi

BEGIN {
  # Reads lines whose field number `field` holds a word written in an
  # ad-hoc Latin-letter encoding of Greek, optionally factored into
  # letters with braces "{}".  Both the expanded (two-letter) and the
  # condensed (one-letter) spellings are accepted:
  #
  #   eh ë  (eta)   th ð  (theta)    ch q   (chi)
  #   ph f  (phi)   ow ô  (omega)    ps ç ß (psi)
  #
  # NOTE(review): "ß" is listed above for psi, but neither the word
  # validation pattern nor the letter mapping in this file accepts it.
  #
  # Breathings, final sigma and other diacritics are not supported.
  # On output the braces get a `\' prefix and each letter becomes a
  # TeX math-greek control sequence (a real Greek font would be
  # better, but...).

  usage = "reencode-words-greek-for-tex \\\n" \
    "  [ -v field=NUM ] \\\n" \
    "  < INFILE.wct > OUTFILE.tex";

  # Negative means "no error so far"; the error helpers set it to 1.
  abort = -1;

  # The column to convert has to be supplied with `-v field=NUM'.
  if (field == "") { arg_error("must specify \"field\""); }
}

# Once an error has been recorded, stop at the next record with that status.
abort >= 0 { exit abort; }

# Blank lines and `#' comment lines are copied to the output unchanged.
/^ *([#]|$)/ { print $0; next; }

/./ {
  # Data line: the selected column must exist ...
  if (field > NF) { data_error("bad NF"); }

  # ... and may contain only encoding letters, braces, `-' and `*'.
  w = $(field);
  if (w ~ /^[-*{}a-ik-uwxzëôðçA-IK-UWXZËÔÐÇ]+$/) {
    # Assigning to the field makes awk rebuild the record (joined
    # with OFS) before it is printed.
    $(field) = reencode_greek_for_tex(w);
    print;
    next;
  }
  data_error(("bad word \"" w "\""));
}

function reencode_greek_for_tex(wd,    w)
{
  # Converts the word `wd', written in the ad-hoc Latin-letter Greek
  # encoding, into a string of TeX control sequences and returns it.
  #
  #   wd  the encoded word; may contain braces, `-' and `*'.
  #   w   local work variable (callers pass only `wd').
  #
  # Two-letter spellings (ch, eh, ow, ph, ps, th) and their one-letter
  # equivalents (q, ë, ô, f, ç, ð) are recognized in both cases.  The
  # TeX-special characters { } # & $ % are escaped with a backslash.
  # Calls data_error (which terminates the program) if any character
  # of `wd' is left unmapped.

  # Put a marker in front of each character that is NOT one of the
  # punctuation characters passed through or escaped below.  This is
  # done before escaping so that the inserted backslashes and the
  # punctuation itself never carry a marker; otherwise every word
  # containing braces, `-' or `*' would fail the sanity check.
  w = gensub(/([^*{}#&$%-])/, "·\\1", "g", wd);
  # Protect special characters:
  gsub(/[{]/,  "\\{", w);
  gsub(/[}]/,  "\\}", w);
  gsub(/[#]/,  "\\#", w);    # Just in case
  gsub(/[&]/,  "\\\\&", w);  # Just in case ("\\&" would yield a bare `&')
  gsub(/[$]/,  "\\$", w);    # Just in case
  gsub(/[%]/,  "\\%", w);    # Just in case
  # Map lowercase letters to TeX escapes.  Two-letter spellings must
  # be handled before the single letter they start with.  A mapped
  # letter loses its marker, so later rules cannot touch the letters
  # inside an already emitted control sequence.
  gsub(/[·][a]/,        "\\alpha", w);
  gsub(/[·][b]/,        "\\beta", w);
  gsub(/[·][c][·][h]/,  "\\chi", w);
  gsub(/[·][ç]/,        "\\psi", w);
  gsub(/[·][d]/,        "\\delta", w);
  gsub(/[·][e][·][h]/,  "\\eta", w);
  gsub(/[·][e]/,        "\\epsilon", w);
  gsub(/[·][ë]/,        "\\eta", w);
  gsub(/[·][f]/,        "\\phi", w);   # `f' is the condensed form of `ph'
  gsub(/[·][g]/,        "\\gamma", w);
  gsub(/[·][i]/,        "\\iota", w);
  gsub(/[·][k]/,        "\\kappa", w);
  gsub(/[·][l]/,        "\\lambda", w);
  gsub(/[·][m]/,        "\\mu", w);
  gsub(/[·][n]/,        "\\nu", w);
  gsub(/[·][o][·][w]/,  "\\omega", w);
  gsub(/[·][o]/,        "\\omicron", w);
  gsub(/[·][ô]/,        "\\omega", w);
  gsub(/[·][p][·][h]/,  "\\phi", w);
  gsub(/[·][p][·][s]/,  "\\psi", w);
  gsub(/[·][p]/,        "\\pi", w);
  gsub(/[·][q]/,        "\\chi", w);
  gsub(/[·][r]/,        "\\rho", w);
  gsub(/[·][s]/,        "\\sigma", w);
  gsub(/[·][t][·][h]/,  "\\theta", w);
  gsub(/[·][t]/,        "\\tau", w);
  gsub(/[·][ð]/,        "\\theta", w);
  gsub(/[·][u]/,        "\\upsilon", w);
  gsub(/[·][x]/,        "\\xi", w);
  gsub(/[·][z]/,        "\\zeta", w);
  # Ditto for uppercase (second letter of a pair may be either case):
  gsub(/[·][A]/,        "\\Alpha", w);
  gsub(/[·][B]/,        "\\Beta", w);
  gsub(/[·][C][·][hH]/, "\\Chi", w);
  gsub(/[·][Ç]/,        "\\Psi", w);
  gsub(/[·][D]/,        "\\Delta", w);
  gsub(/[·][E][·][hH]/, "\\Eta", w);
  gsub(/[·][E]/,        "\\Epsilon", w);
  gsub(/[·][Ë]/,        "\\Eta", w);
  gsub(/[·][F]/,        "\\Phi", w);   # `F' is the condensed form of `Ph'
  gsub(/[·][G]/,        "\\Gamma", w);
  gsub(/[·][I]/,        "\\Iota", w);
  gsub(/[·][K]/,        "\\Kappa", w);
  gsub(/[·][L]/,        "\\Lambda", w);
  gsub(/[·][M]/,        "\\Mu", w);
  gsub(/[·][N]/,        "\\Nu", w);
  gsub(/[·][O][·][wW]/, "\\Omega", w);
  gsub(/[·][O]/,        "\\Omicron", w);
  gsub(/[·][Ô]/,        "\\Omega", w);
  gsub(/[·][P][·][hH]/, "\\Phi", w);
  gsub(/[·][P][·][sS]/, "\\Psi", w);
  gsub(/[·][P]/,        "\\Pi", w);
  gsub(/[·][Q]/,        "\\Chi", w);
  gsub(/[·][R]/,        "\\Rho", w);
  gsub(/[·][S]/,        "\\Sigma", w);
  gsub(/[·][T][·][hH]/, "\\Theta", w);
  gsub(/[·][T]/,        "\\Tau", w);
  gsub(/[·][Ð]/,        "\\Theta", w);
  gsub(/[·][U]/,        "\\Upsilon", w);
  gsub(/[·][X]/,        "\\Xi", w);
  gsub(/[·][Z]/,        "\\Zeta", w);
  # Sanity check: a surviving marker means some character was not mapped.
  if (w ~ /[·]/) { data_error(("bad word \"" wd "\"")); }
  return w;
}

function arg_error(msg)
{
  # Reports a command-line problem: writes `msg' and then the usage
  # summary to standard error, records the failure in `abort', and
  # terminates with status 1.
  printf("%s\n", msg) > "/dev/stderr";
  printf("usage: %s\n", usage) > "/dev/stderr";
  abort = 1;
  exit abort;
}

function data_error(msg)
{
  # Reports a problem in the input data: writes `msg', prefixed with
  # the current line number of the input file (FNR), to standard
  # error, records the failure in `abort', and terminates with
  # status 1.
  printf("line %d: %s\n", FNR, msg) > "/dev/stderr";
  abort = 1;
  exit abort;
}