#! /usr/bin/gawk -f 

# Counts the number of Voynichese text characters in an EVT-like file

# Start the running total of counted text characters at zero.
BEGIN { nBytes = 0 }

# Lines whose first character is "#" are skipped outright: no later rule
# sees them, so they add nothing to the count.
/^#/ { next }

function cleanup(txt)
{
  # Returns a copy of `txt` with everything that should not be counted
  # stripped out.  The steps run in this order:
  #   1. filler characters "%", "!" and blanks;
  #   2. inline comments of the form "{...}";
  #   3. alternative readings "[a|b|c]", of which only the first is kept.

  # Step 1: filler characters (note that spaces go too).
  gsub(/[% !]/, "", txt)

  # Step 2: "{...}" comments, braces included.
  gsub(/\{[^}]*\}/, "", txt)

  # Step 3: drop everything from the first "|" up to and including the
  # closing "]", then strip whatever brackets are left over (the opening
  # "[" of each group, plus the "]" of any group that had no "|").
  gsub(/\|[^\]]*\]/, "", txt)
  gsub(/[\[\]]/, "", txt)

  return txt
}

# Handles every non-empty line that was not already consumed by an earlier
# rule.  If the line starts with "<", its first 19 columns are taken to be
# a location code (blank-padded on the right) and are excluded from the
# count; the code is checked against the expected
# "<fNNN[rv]N.unit.line;X>" shape and a diagnostic is written to stderr
# when it does not match (the line's text is still counted).  The rest of
# the line is passed through cleanup() and its length is added to nBytes.
/./ {
  if (substr($0, 1, 1) == "<") {
    skip = 19
    loc = substr($0, 1, skip)
    # Strip the blank padding that fills the locator out to `skip` columns.
    sub(/ +$/, "", loc)
    # The ";" is written unescaped: "\;" is not a regexp operator and makes
    # gawk emit a warning on stderr; both forms match the same text.
    if (loc !~ /^<f[0-9]+[vr]*[0-9]*\.[A-Za-z][A-Za-z0-9]*\.[0-9]+[a-z]*;[A-Z]>$/)
      printf "line %d, bad location \"%s\"\n", NR, loc > "/dev/stderr"
  } else {
    skip = 0
  }

  # Nothing follows the locator: no text to count on this line.
  if (skip >= length($0)) next

  txt = cleanup(substr($0, 1 + skip))
  nBytes += length(txt)
  next
}

# Report the final total twice: once on stderr and once on stdout.
END {
  print nBytes > "/dev/stderr"; print nBytes
}