#! /usr/bin/gawk -f
# Last edited on 1999-01-29 21:04:55 by stolfi

# Usage: $0 < INFILE > OUTFILE
# Removes all comments from an evt-like file.
# Chooses the first alternative of every "[...|...|...]" group.
# Leaves only the location code (if any) and the Voynich text. 

# Exit-status flag.  A negative value means "keep going"; once it holds
# a non-negative value the rule below ends the run with that status.
# NOTE(review): nothing visible in this file ever assigns a non-negative
# value to it -- presumably a hook for error handling; confirm.
BEGIN { abort = -1 }

# Checked before every input record: bail out as soon as the flag is set.
abort >= 0 { exit abort }

# Whole-line comments (lines starting with "#") are dropped.
$0 ~ /^#/ { next }

# A line containing a "<...>" tag with neither "." nor ";" inside it
# is copied to the output unchanged, without any cleanup.
# (Presumably these are page/unit header lines such as "<f1r>" -- confirm.)
$0 ~ /<[^.;]*>/ { print $0; next }

# Main rule: handles every non-empty line not consumed by an earlier rule.
#
# If the line starts with "<", its first 19 columns are taken as the
# location code (trailing blanks stripped) and the rest as the text.
# A location code that does not look like "<fNNN.UNIT.LINE;T>" is reported
# on stderr, but the line is still processed.  Otherwise the whole line is
# text and a synthetic location "<f0.T.N;X>" is made up, N being the
# current line number within the input file (FNR).
#
# Lines with nothing after the location field are skipped.  The text is
# passed through cleanup() and printed after the location code, which is
# left-justified and padded to 19 columns.
/./ {
  if (substr($0,1,1) == "<") 
    { skip = 19;
      loc = substr($0,1,19);
      gsub(/  *$/, "", loc);
      # ";" is not a regexp operator, so it is written unescaped: gawk
      # strips the backslash of an unknown escape such as "\;" and newer
      # versions print a warning about it.  The pattern matches the same
      # strings either way.
      if ( loc !~ /^<f[0-9]+[vr]*[0-9]*\.[A-Za-z][A-Za-z0-9]*\.[0-9]+[a-z]*;[A-Z]>$/ )
        { printf "line %d, bad location \"%s\"\n", NR, loc > "/dev/stderr" }
    } 
  else 
    { skip = 0;
      loc = ("<f0.T." FNR ";X>");
    }
  # Nothing beyond the location field: no text to output.
  if (skip >= length($0)) next;
  txt = cleanup(substr($0,1+skip));
  printf "%-19s%s\n", loc, txt;
  next;
}

function cleanup(txt)
{
  # Returns a copy of `txt' reduced to plain transcription text:
  #   - "{...}" inline comments are deleted;
  #   - blanks and "!" are deleted, then each "%" becomes one blank;
  #   - every "[a|b|...]" group is reduced to its first alternative "a".

  # Inline comments go first.  Doing this before the filler handling
  # gives the same result, since the steps below never add, remove or
  # move a "{" or "}".
  gsub(/\{[^}]*\}/, "", txt)

  # Fillers.  Order matters here: existing blanks must be gone before
  # "%" is turned into a blank, or the new blanks would be deleted too.
  gsub(/[ !]/, "", txt)
  gsub(/[%]/, " ", txt)

  # Alternative readings: drop the opening bracket, then everything from
  # the first "|" through the closing bracket, then any "]" still left
  # over (groups that had a single reading).
  gsub(/\[/, "", txt)
  gsub(/\|[^\]]*\]/, "", txt)
  gsub(/\]/, "", txt)

  return txt
}