#! /usr/bin/gawk -f

# Recoding Voynich text (or interlinear file)
# from FSG to ECC (my error-tolerant lossy encoding)

# pcmt is a one-shot flag: while it is non-zero, the first comment line
# read from the input is preceded by a banner naming this filter.
BEGIN { pcmt = 1 }

# ecc_apply_table(txt, table) - run an ordered list of global substitutions.
#
#   txt    the string to rewrite
#   table  whitespace-separated tokens read as consecutive pairs
#          "pattern replacement pattern replacement ...".  Each pattern is
#          used as a (dynamic) regular expression.
#
# The pairs are applied strictly left to right, each one over the result
# of the previous one, so the order of the table is significant.
# Returns the rewritten string.  (pair, n, i are locals.)
function ecc_apply_table(txt, table,    pair, n, i)
{
  n = split(table, pair, " ");
  for (i = 1; i < n; i += 2)
    gsub(pair[i], pair[i + 1], txt);
  return txt
}

# ecc(txt) - recode one piece of FSG-transcribed text into ECC.
#
# Works in three stages: strip everything that is not plain text,
# expand FSG symbols into JSA (Stolfi's super-analytic alphabet),
# then collapse JSA into the lossy ECC alphabet.
# Returns the recoded string.
function ecc(txt)
{
  # Stage 1: clean-up.

  # Drop "%", "!" and blanks; column alignment cannot survive the
  # recoding anyway.
  gsub(/[% !]/, "", txt);

  # Drop inline comments, i.e. anything of the form {...}.
  gsub(/\{[^}]*\}/, "", txt);

  # For alternative readings written as [x|y|...] keep only the first:
  # remove the opening bracket, then everything from the first "|" up
  # to the closing bracket, then any closing bracket still left.
  gsub(/\[/, "", txt);
  gsub(/\|[^\]]*\]/, "", txt);
  gsub(/\]/, "", txt);

  # Drop "." (the word-space marker in the evt format), since word
  # spaces are not reliable.
  gsub(/[.]/, "", txt);

  # Stage 2: FSG -> JSA.  Multi-character groups are listed before the
  # single symbols they contain, so they are rewritten first.
  # Note that S and T both become "cc" (S was "csc" in JSA), and that
  # V and Y both become "?".
  txt = ecc_apply_table(txt, \
    "IIIK iiiij  IIIL iiiiu  IIIR iiiis  IIIE iiiix " \
    "IIE iiix  IIR iiis  IIK iiij " \
    "HZ cqjc  PZ cqgc  DZ cljc  FZ clgc " \
    "IE iix  IR iis  IK iij " \
    "2 cs  4 q  6 cj  7 ig  8 cg " \
    "A ci  C c  D lj  E ix  F lg  G cy  H qj  I i " \
    "K ij  L iu  M iiiu  N iiu  O o  P qg  R is " \
    "S cc  T cc  V ?  Y ?");

  # Stage 3: JSA -> ECC.  Several JSA groups map to the same ECC
  # letter, and any run of "i" in front of e/k/r/m is absorbed into
  # that letter.
  txt = ecc_apply_table(txt, \
    "[ql]j H  [ql]g P " \
    "ix e  ij k  is r  iu m " \
    "y i  ci a  cg 8  ig 8  cs r " \
    "ii*e e  ii*k k  ii*r r  ii*m m " \
    "a o");

  return txt
}

# Main rule: classify each input line and handle it in the first case
# that applies.
{
  if ($0 ~ /^ *$/) {
    # Empty or all-blank line: copy unchanged.
    print
    next
  } else if ($0 ~ /^ *#/) {
    # Comment line: copy unchanged.  The very first comment line is
    # preceded by a banner identifying this filter (pcmt is then cleared
    # so the banner is printed only once).
    if (pcmt) {
      print "# Output of fsg2ecc - Stolfi's error-tolerant encoding"
      pcmt = 0
    }
    print
    next
  } else if ($0 ~ /^<[^>]*> *$/) {
    # A <...> locator with no text after it: copy unchanged.
    print
    next
  } else if ($0 ~ /^</) {
    # Locator followed by text: the first 19 columns are kept verbatim
    # and everything from column 20 onwards is recoded.
    curtxt = ecc(substr($0, 20))
    print substr($0, 1, 19) curtxt
    next
  } else if ($0 ~ /./) {
    # Any other non-empty line is treated as plain text and recoded whole.
    print ecc($0)
  }
}