#! /usr/bin/gawk -f 
# Last edited on 2001-01-02 01:54:46 by stolfi
#
# Replaces 8-bit weirdo codes by "*{&NNN}" where NNN is the decimal
# code of the weirdo.
#
# Also replaces any capitalized (ligated) EVA letter "C" by "c{&C}"
# where "c" is the lower-case equivalent.
#
# See also "remove-needless-capitalization"

BEGIN {
  # Error status: stays negative until format_error/arg_error set it.
  abort = -1;
  
  # Recoding table "tbl": maps each non-basic input character (or
  # letter + quote-mark pair) to its replacement "x{&TAG}", where "x"
  # is the basic character to emit and TAG identifies the original.

  # Capitalized (ligated) EVA letters:

  tbl["A"] = "a{&A}";
  tbl["E"] = "e{&E}";
  tbl["F"] = "f{&F}";
  tbl["H"] = "h{&H}";
  tbl["I"] = "i{&I}";
  tbl["K"] = "k{&K}";
  tbl["O"] = "o{&O}";
  tbl["P"] = "p{&P}";
  tbl["S"] = "s{&S}";
  tbl["R"] = "r{&R}";
  tbl["T"] = "t{&T}";
  tbl["Y"] = "y{&Y}";
  
  # Letters followed by a single or double quote mark:

  tbl["c'"]  = "s{&c'}";
  tbl["e'"]  = "s{&e'}";
  tbl["I'"]  = "r{&I'}";
  tbl["I\""] = "r{&I\"}";
  tbl["o'"]  = "o{&o'}";
  tbl["O'"]  = "o{&O'}";
  tbl["O\""] = "o{&O\"}";
  tbl["q\""] = "q{&q\"}";
  tbl["y'"]  = "y{&y'}";

  # 8-bit codes.  The key is the character written as an octal escape;
  # the three digits inside "{&NNN}" are the same code in decimal.

  tbl["\202"] = "*{&130}";
  tbl["\203"] = "*{&131}";
  tbl["\204"] = "*{&132}";
  tbl["\205"] = "*{&133}";
  tbl["\206"] = "*{&134}";
  tbl["\207"] = "*{&135}";

  tbl["\210"] = "*{&136}";
  tbl["\211"] = "k{&137}";
  tbl["\212"] = "*{&138}";
  tbl["\213"] = "*{&139}";
  tbl["\214"] = "*{&140}";
  tbl["\215"] = "*{&141}";
  tbl["\216"] = "*{&142}";
  tbl["\217"] = "r{&143}";
  
  tbl["\220"] = "*{&144}";
  tbl["\221"] = "*{&145}";
  tbl["\222"] = "*{&146}";
  tbl["\223"] = "*{&147}";
  tbl["\224"] = "*{&148}";
  tbl["\225"] = "*{&149}";
  tbl["\226"] = "c{&150}";
  tbl["\227"] = "*{&151}";
  
  tbl["\230"] = "*{&152}";
  tbl["\231"] = "*{&153}";
  tbl["\232"] = "*{&154}";
  tbl["\233"] = "*{&155}";
  tbl["\234"] = "*{&156}";
  tbl["\235"] = "*{&157}";
  tbl["\236"] = "*{&158}";
  tbl["\237"] = "*{&159}";

  # "\240"=&160 (the no-break space in ISO Latin-1) has no entry.
  
  # The trailing comments show the ISO Latin-1 glyph for each code.

  tbl["\241"] = "*{&161}"; # "¡"
  tbl["\242"] = "k{&162}"; # "¢"
  tbl["\243"] = "*{&163}"; # "£"
  tbl["\244"] = "r{&164}"; # "¤"
  tbl["\245"] = "m{&165}"; # "¥"
  tbl["\246"] = "*{&166}"; # "¦"
  tbl["\247"] = "q{&167}"; # "§"

  tbl["\250"] = "*{&168}"; # "¨"
  tbl["\251"] = "*{&169}"; # "©"
  tbl["\252"] = "*{&170}"; # "ª"
  tbl["\253"] = "*{&171}"; # "«"
  tbl["\254"] = "r{&172}"; # "¬"
  tbl["\255"] = "y{&173}"; # soft hyphen
  tbl["\256"] = "*{&174}"; # "®"
  tbl["\257"] = "*{&175}"; # "¯"
  tbl["\260"] = "k{&176}"; # "°"
  tbl["\261"] = "p{&177}"; # "±"
  tbl["\262"] = "t{&178}"; # "²"
  tbl["\263"] = "p{&179}"; # "³"
  tbl["\264"] = "p{&180}"; # "´"
  tbl["\265"] = "p{&181}"; # "µ"
  tbl["\266"] = "*{&182}"; # "¶"
  tbl["\267"] = "*{&183}"; # "·"
  tbl["\270"] = "f{&184}"; # "¸"
  tbl["\271"] = "x{&185}"; # "¹"
  tbl["\272"] = "p{&186}"; # "º"
  tbl["\273"] = "*{&187}"; # "»"
  tbl["\274"] = "*{&188}"; # "¼"
  tbl["\275"] = "y{&189}"; # "½"
  tbl["\276"] = "*{&190}"; # "¾"
  tbl["\277"] = "*{&191}"; # "¿"
  tbl["\300"] = "*{&192}"; # "À"
  tbl["\301"] = "*{&193}"; # "Á"
  tbl["\302"] = "*{&194}"; # "Â"
  tbl["\303"] = "*{&195}"; # "Ã"
  tbl["\304"] = "*{&196}"; # "Ä"
  tbl["\305"] = "*{&197}"; # "Å"
  tbl["\306"] = "*{&198}"; # "Æ"
  tbl["\307"] = "*{&199}"; # "Ç"
  tbl["\310"] = "h{&200}"; # "È"
  tbl["\311"] = "d{&201}"; # "É"
  tbl["\312"] = "t{&202}"; # "Ê"  (octal 312 is decimal 202, not 201)
  tbl["\313"] = "*{&203}"; # "Ë"
  tbl["\314"] = "*{&204}"; # "Ì"
  
  # "\315"=&205 not in use 
  
  tbl["\316"] = "*{&206}"; # "Î"
  tbl["\317"] = "*{&207}"; # "Ï"
  tbl["\320"] = "*{&208}"; # "Ð"
  tbl["\321"] = "*{&209}"; # "Ñ"
  tbl["\322"] = "*{&210}"; # "Ò"
  tbl["\323"] = "*{&211}"; # "Ó"
  tbl["\324"] = "*{&212}"; # "Ô"
  tbl["\325"] = "*{&213}"; # "Õ"
  tbl["\326"] = "*{&214}"; # "Ö"
  tbl["\327"] = "t{&215}"; # "×"
  tbl["\330"] = "*{&216}"; # "Ø"
  
  # "\331"=&217 to "\373"=&251 not in use
  
  tbl["\374"] = "*{&252}"; # "ü"
  tbl["\375"] = "*{&253}"; # "ý"
  tbl["\376"] = "*{&254}"; # "þ"
  tbl["\377"] = "*{&255}"; # "ÿ"
  
  # Inverse table "oct": for every entry of the form "x{&NNN}" (three
  # decimal digits), map "NNN" back to the 8-bit character, so that an
  # "&NNN" code found in the input can be recoded through "tbl".
  # Each NNN must occur only once in "tbl", otherwise the winner would
  # depend on the traversal order of "for (c in tbl)".
  # The braces are bracketed so they are always taken literally.
  for (c in tbl)
    { d = tbl[c]; 
      if (match(d, /^.[{][&][0-9][0-9][0-9][}]$/))
        { d = substr(d,4,3); oct[d] = c; }
    }
}

# Checked before every record: once "abort" is non-negative, stop and
# use it as the exit status.
(abort >= 0) { exit abort; }

/^[#]/ {
  # Lines starting with "#" are copied to the output unchanged.
  print $0;
  next;
}

/^[<]f[0-9]+[rv]?[0-6]?(|[.][A-Za-z][A-Za-z0-9]?)[>]/ { 
  # Page/unit header lines ("<fNNN...>" locators): copied unchanged,
  # and kept away from the generic "<" rule that follows.
  print $0;
  next;
}

/^[<]/ {
  # Normal EVMT-format text lines: "<locator>" followed by the text.
  # The locator is kept verbatim and padded to 18 columns; only the
  # text after it is recoded.
  lin = $0;

  # The locator ends at the first ">".
  p = index(lin, ">");
  if (p == 0) { format_error("unmatched `<'"); }

  txt = substr(lin, p+1);
  loc = substr(lin,1,p);

  # Drop the blanks between the locator and the text.
  sub(/^[ ]*/, "", txt);

  printf "%-18s %s\n", loc, basify_text(txt);
  next;
}

{
  # Any line not handled by an earlier rule (non-EVMT text):
  # recode the whole line.
  print basify_text($0);
}
  
function basify_text(txt,    lb,rb,plain,note,out)
{
  # Returns "txt" with every stretch of text outside "{...}" comments
  # recoded by basify_chunk; the comments themselves are copied as-is.
  # Calls format_error if a "{" has no matching "}".
  out = "";
  while (txt != "") 
    { # Peel off the text before the next "{" and the comment it opens:
      lb = index(txt, "{");
      if (lb > 0)
        { plain = substr(txt, 1, lb-1);
          txt = substr(txt, lb);
          rb = index(txt, "}");
          if (rb == 0) { format_error("unclosed `{'"); }
          note = substr(txt, 1, rb);
          txt = substr(txt, rb+1);
        }
      else
        { # No more comments: the remainder is all plain text.
          plain = txt; note = ""; txt = "";
        }

      # Only hand the text to basify_chunk when it has a character
      # outside the set tested here.
      if (match(plain, /[^-.,=*a-z]/)) { plain = basify_chunk(plain); }
      out = (out plain note);
    }
  return (out);
}

function basify_chunk(txt,  n,i,c,d,skip,res,pair)
{
  # Recodes a comment-free stretch of text "txt" using the global
  # tables "tbl" and "oct" (built in the BEGIN block), and returns
  # the result.
  #
  #  * A character below "\040" or above "\176" is replaced by its
  #    "tbl" entry; format_error is called if it has none.
  #  * "&NNN", "&#NNN", each optionally followed by ";", with NNN a
  #    decimal code known to "oct", is replaced by the "tbl" entry of
  #    that character; an unknown NNN calls format_error.
  #  * A character followed by "'" or "\"" is looked up as a pair
  #    first; if the pair has no entry, the character is looked up on
  #    its own and the mark is left in place.
  #  * Anything else is copied, or replaced by its "tbl" entry.
  #
  # NOTE(review): the 8-bit handling presumably needs a byte-oriented
  # locale (e.g. LC_ALL=C) so substr() yields single bytes -- confirm.

  # Nothing to do if every character is in the set tested here.
  if (! match(txt, /[^-a-vx-z,.=*?!%]/)) { return txt; }
  res = "";
  n = length(txt);
  for (i=1;i<=n;i++)
    { c = substr(txt,i,1);
      if ((c < "\040") || (c > "\176")) 
        { if (c in tbl)
            { c = tbl[c]; }
          else
            { format_error("invalid 8-bit char"); }
        }
      else if ((c == "&") && match(substr(txt,i), /^[&][#]?([0-2][0-9][0-9])[;]?/))
        { # "skip" is the number of characters after the "&" that
          # belong to the code; strip "#" and ";" to get the digits.
          skip = RLENGTH-1; c = substr(txt,i+1,skip); 
          gsub(/[#]/, "", c); gsub(/[;]$/, "", c); 
          if (c in oct)
            { c = tbl[oct[c]]; i += skip; }
          else
            { format_error("invalid &-code"); }
        }
      else 
        { # Prefer the two-character entry (letter + quote mark).
          pair = "";
          if ((i < n) && (index("'\"", (d = substr(txt,i+1,1))) != 0))
            { pair = (c d); }
          if ((pair != "") && (pair in tbl))
            { c = tbl[pair]; i++; }
          else if (c in tbl)
            { # No entry for the pair (e.g. "A'"): still recode the
              # letter itself; the mark is handled on the next pass
              # of the loop and is copied through as-is.
              c = tbl[c];
            }
        }
      res = (res c);
    }
  return (res);
}

function format_error(msg)
{ 
  # Reports "msg" on standard error, prefixed with the current input
  # file name and line number, records the failure in the global
  # "abort", and terminates with exit status 1.
  abort = 1;
  printf "file %s, line %d: %s\n", FILENAME, FNR, msg > "/dev/stderr";
  exit 1;
}

function arg_error(msg)
{ 
  # Reports "msg" on standard error (no file/line prefix), records the
  # failure in the global "abort", and terminates with exit status 1.
  abort = 1;
  printf "%s\n", msg > "/dev/stderr";
  exit 1;
}