#! /usr/bin/gawk -f
# Usage: $0 < INFILE > OUTFILE
# Removes all EVMT comments and formatting.
# Leaves only the Voynich text.

# "abort" holds the exit status to terminate with; a negative value
# means that no termination has been requested.
BEGIN { abort = -1; }

# Once "abort" is non-negative, stop and use it as the exit status.
# NOTE(review): nothing in the visible script ever sets "abort" to a
# non-negative value -- confirm whether this rule is still needed.
abort >= 0 { exit abort }

# Lines whose first character is "#" are comments: drop them.
substr($0, 1, 1) == "#" { next }

# Main rule: handles every non-empty line that was not dropped above.
#
# A line starting with "<" is taken to carry a location code in its
# first 19 columns (blank-padded on the right).  The code is checked
# against the expected pattern; a mismatch is only reported on stderr,
# and the line is still processed.  The location columns are then
# dropped, the remainder is passed through cleanup(), and the result
# is printed.  Lines with nothing after the location columns are
# silently skipped.
/./ {
  if (substr($0,1,1) == "<") 
    { skip = 19;
      loc = substr($0,1,19);
      # Strip the blank padding before validating the location code.
      sub(/ +$/, "", loc);
      # ";" is not a regexp operator, so it is written unescaped:
      # "\;" is undefined in POSIX EREs and makes gawk emit a warning.
      if ( loc !~ /^<f[0-9]+[vr]*[0-9]*\.[A-Za-z][A-Za-z0-9]*\.[0-9]+[a-z]*;[A-Z]>$/ )
        { printf "line %d, bad location \"%s\"\n", NR, loc > "/dev/stderr" }
    } 
  else 
    { # No location code: the whole line is text.
      skip = 0;
      loc = "";
    }
  # Nothing beyond the location columns: no text to output.
  if (skip >= length($0)) { next; }
  txt = cleanup(substr($0,1+skip));
  print txt;
  next;
}

function cleanup(txt)
{
  # Strips the markup from one line of transcription text and
  # returns what is left.  The steps below are order-dependent.

  # Delete blanks and "!" first, then turn every "%" into a blank
  # (so the blanks produced from "%" survive).
  gsub(/!| /, "", txt);
  gsub(/%/, " ", txt);

  # Delete "{...}" comments (innermost, non-nested form).
  gsub(/[{][^{}]*[}]/, "", txt);

  # Alternative readings "[a|b|c]": keep only the first one.
  # Everything from a "|" up to the next "]" is deleted, and then
  # any bracket characters still present are deleted as well.
  gsub(/[|][^]]*[]]/, "", txt);
  gsub(/[][]/, "", txt);

  return txt;
}