#! /usr/bin/gawk -f
# Last edited on 2004-07-15 23:37:55 by stolfi

BEGIN {
  # Validates the format of the new split and unfolded interlinear file.

  # "abort" stays negative while the run is healthy.  arg_error() and
  # fatal_error() set it to the exit status before calling "exit";
  # the END block then reports the abort and exits with that status.
  abort = -1;

  usage = ( \
      "validate-new-evt-format \\\n" \
      "  [ -v chars=CHARS ] \\\n" \
      "  [ -v location=LOC ] \\\n" \
      "  [ -v checkTerminators=1 ] \\\n" \
      "  [ -v checkLineLengths=1 ] \\\n" \
      "  [ -v requireUnitHeaders=0 ] \\\n" \
      "  [ -v requirePageHeaders=0 ] \\\n" \
      "  < INFILE > OUTFILE " \
    );

  # where CHARS are the allowed non-space letters, and LOC is a
  # location code without line number e.g. "f103v2.T1" or "f83r"

  # Default alphabet when the caller gives none.
  if (chars == "") 
    { chars = "\"'abcdefghijklmnopqrstuvxyzAEFIKOPSTY"; }
  # The characters [-=,.] are handled separately by the text checks
  # (word breaks and line terminators), so they may not be listed
  # as ordinary letters.
  if (chars ~ /[-=,.]/) 
    { arg_error("invalid characters in \"chars\" list\n"); }
    
  # Optional checks are off by default; header requirements are on
  # by default.
  if (checkTerminators == "")   { checkTerminators = 0; }
  if (checkLineLengths == "")   { checkLineLengths = 0; }
  if (requireUnitHeaders == "") { requireUnitHeaders = 1; }
  if (requirePageHeaders == "") { requirePageHeaders = 1; }
    
  # The "location" parameter is used to check ## <..> lines
  # exclusively. The values of 'fnum' and "unit" are extracted from
  # those lines and used to check text lines.
    
  # Location fields of previous line:
  old_fn = ""; old_un = ""; old_ln = "";
  
  # Number of input lines flagged so far:
  nerrors = 0;
}

# Once an abort status has been set, stop reading input at once.
abort >= 0 { exit abort; }

# Lines that are empty or contain only spaces are skipped silently.
$0 ~ /^[ ]*$/ {
  next;
}

# `##'-comment (page/unit header): strip the "##" prefix and any
# blanks after it, then validate the remainder as a page header.
# Note that $0 is modified, so print_line() shows the stripped text.
$0 ~ /^##/ {
  sub(/^## */, "");
  if (check_page_header($0)) { next; }
  print_line();
  nerrors++;
  next;
}

# Any other `#'-comment is ignored.
$0 ~ /^[#]/ {
  next;
}

# VTT-style page/unit header: a leading <...> group with no ";" in it
# (text-line locators always contain a ";").
$0 ~ /^<[^;<>]*>/ {
  if (check_page_header($0)) { next; }
  print_line();
  nerrors++;
  next;
}

# Text line: anything else that starts with "<".
$0 ~ /^</ {
  if (check_text_line($0)) { next; }
  print_line();
  nerrors++;
  next;
}

# Other lines: whatever reaches this rule fits no known line type.
$0 ~ /./ {
  format_error("bad line format");
  print_line();
  nerrors++;
  next;
}

END {
  # Normal termination prints the error tally; an aborted run
  # prints "aborted" and exits with the recorded status instead.
  if (abort < 0)
    { printf "\n" > "/dev/stderr";
      printf "%d errors flagged\n", nerrors > "/dev/stderr";
    }
  else
    { printf "aborted\n" > "/dev/stderr";
      exit abort;
    }
}
  

# Validates one text line of the form "<FNUM.UNIT.LINE;T>  TEXT".
#
# Parameter:
#   lin   the complete input line.
# (All other names in the parameter list are local variables.)
#
# Returns 1 if no problem was found, 0 otherwise.  Every problem is
# reported through format_error(); an internal inconsistency ends
# the run through fatal_error().
#
# Globals read: chars, checkTerminators, checkLineLengths,
#   requireUnitHeaders, requirePageHeaders.
# Globals read and updated: fnum, unit, old_fn, old_un, old_ln,
#   old_nc, old_cr, tr_seen.  The old_* fields are NOT updated when
#   the line is rejected early for a malformed locator.
function check_text_line(lin,  txt,loc,len,fld,fn,un,ln,tr,nf,nc,cr,res)
{
  res = 1;

  # The text proper is expected to start at column 20.
  if (length(lin) <= 19)
    { format_error("missing text"); res = 0; }
  
  # Check general format, and extract location code and text proper.
  # Note that the line number must start with a digit,
  # while the unit code must start with a letter:
  match(lin, /^<f[0-9]+[vr]?[0-9]?[.][A-Za-z][A-Za-z0-9]*[.][0-9]+[a-e]?;[A-Z]>/);
  if (RSTART != 1) 
    { format_error("bad location format"); return 0; }
 
  # "loc" is the locator without the enclosing "<" and ">";
  # "txt" is everything after the ">".
  loc = substr(lin,RSTART+1,RLENGTH-2);
  txt = substr(lin,RLENGTH+1);

  # Columns between the locator and column 19 must all be blank,
  # and column 20 must not be blank.
  if (substr(lin,RLENGTH+1, 19-RLENGTH) != substr("                    ", 1, 19-RLENGTH))
    { format_error("too few blanks"); res = 0; }
  if (substr(lin,20,1) == " ")
    { format_error("too many blanks"); res = 0; }

  gsub(/^[ ]+/, "", txt);
  gsub(/[ ]+$/, "", txt);

  # Validate location code
  # Split location into fields.  The last character is the
  # transcription code and the one before it must be the ";":
  len = length(loc);
  tr = substr(loc, len,1);
  if (substr(loc, len-1, 1) != ";")  { fatal_error("program error"); }
  # Drop the trailing ";T".  The start index must be an explicit 1:
  # awk implementations disagree on how a start index below 1 is
  # treated, and some shorten the extracted span.
  loc = substr(loc, 1, len-2);
  # A separate array is used for the fields, since one awk variable
  # cannot serve both as a scalar and as an array.
  nf = split(loc, fld, /[.]/);
  if (nf != 3) { fatal_error("program error"); }
  fn = fld[1]; un = fld[2]; ln = fld[3];
  
  # Check page f-number:
  if (requirePageHeaders)
    { if (fnum == "")
        { format_error("missing page header line"); res = 0;
          # Adopt this line's f-number so the complaint is not
          # repeated for every following line.
          fnum = fn;
        }
      else
        { if (fn != fnum)
           { format_error(("wrong page f-number, should have been " fnum)); res = 0; }
        }
    }
  else
    { fnum = fn; }

  if (requireUnitHeaders)
    { # Check unit tag:
      if (unit == "")
        { format_error("missing unit header line"); res = 0;
          # Same recovery as for the page header above.
          unit = un;
        }
      else
        { if (un != unit)
           { format_error(("wrong unit code, should have been " unit)); res = 0; }
        }
     }
   else
     { unit = un; }

  # Convert line numbers to pure number, so that "12" becomes "120"
  # and "12a".."12e" become "121".."125":
  if (match(ln, /[0-9]$/)) 
    { # Append a "0"
      ln = (ln "0");
    }
  else
    { # Convert the final letter to a digit:
      gsub(/[a]$/, "1", ln);
      gsub(/[b]$/, "2", ln);
      gsub(/[c]$/, "3", ln);
      gsub(/[d]$/, "4", ln);
      gsub(/[e]$/, "5", ln);
    }
  if (! match(ln, /^[0-9][0-9]*$/)) { fatal_error("program error"); }

  # printf "[%s][%s][%s] -> [%s][%s][%s][%s]", \
  #   old_fn, old_un, old_ln, fn, un, ln, tr > "/dev/stderr";
    
  # Check for non-decreasing line numbers:
  if ((fn == old_fn) && (un == old_un))
    { if ((ln + 0) < (old_ln + 0)) 
        { format_error("lines out of order"); res = 0; }
    }

  # Check for repeated transcription code:
  if ((fn == old_fn) && (un == old_un) && (ln == old_ln))
    { if (tr in tr_seen) 
        { format_error("repeated transcription code"); res = 0; }
    }
  else
    { # New locator: forget the codes seen for the previous one.
      split("", tr_seen);
    }
  
  tr_seen[tr] = 1;
  
  # Validate line length
  
  if (checkLineLengths)
    { 
      # Remove trailing comments and fillers, if any
      while (gsub(/{[^{}]*}$/, "", txt)) { }
      nc = length(txt);
      if ((fn == old_fn) && (un == old_un) && (ln == old_ln)) 
        { if ((old_nc != -1) && (nc != old_nc))
            { format_error(("inconsistent line lengths (" old_nc ":" nc ")"));
              res = 0;
            }
        }
      old_nc = nc;
    }
  
  
  # Validate text proper
  
  # Remove '{}' comments
  gsub(/{[^{}]*}/, "", txt);
  
  # Ignore trailing blanks:
  gsub(/  *$/, "", txt);
  
  # Assume the '[|]' groups have been unfolded, 
  # otherwise we should do this:
  # gsub(/\[[-*%A-Z.24678]*[|][-*%A-Z.24678]*\]/, "", txt);
  
  # Remove non-significant fillers [!] but leave skip-markers [%]:
  gsub(/[!]/, "", txt);
  
  if (txt == "") 
    { 
      # Empty lines are OK.
    }
  else
    { 
      # Remove weirdoes
      gsub(/[&][0-9][0-9][0-9];/, "*", txt);
      # Check for leading or double word breaks 
      # (one trailing word break is OK, e.g. in circular text.)
      if (txt ~ /^[-.,]./)
        { format_error("leading [-.,]"); res = 0; }
      # Every character but the last must be a word break, "*", or
      # one of "chars"; the last one is checked separately below.
      if (txt !~ ("^[-.,*" chars "]*.$"))
        { format_error("invalid char in text"); res = 0; }
      if (txt ~ /[-.,][-.,]/)
        { format_error("doubled [-.,]"); res = 0; }
      if (txt !~ /[-=.,]$/)
        { format_error("text should end with [-=.,]"); res = 0; }
      else 
        { cr = substr(txt, length(txt), 1);
          if ((fn == old_fn) && (un == old_un) && (ln == old_ln)) 
            { if ((old_cr != "") && (cr != old_cr) && (checkTerminators))
                { format_error(("inconsistent line terminator (" old_cr ":" cr ")"));
                  res = 0;
                }
            }
          old_cr = cr;
        }
    }

  # Remember this locator for the checks on the next text line.
  old_fn = fn; old_un = un; old_ln = ln;

  return res;
}

# Validates a page/unit header of the form "<FNUM[.UNIT]> {ATTRS}".
#
# Parameter:
#   lin   the header text (for "##" headers, the "##" prefix has
#         already been stripped by the calling rule).
# (All other names in the parameter list are local variables.)
#
# Returns 1 if the header is acceptable, 0 otherwise.  Problems are
# reported through format_error().
#
# Globals read: location.
# Globals written: fnum and unit, taken from the header's locator
#   ("unit" becomes "" when the locator has no unit part).  They are
#   left untouched when the overall line shape is rejected.
function check_page_header(lin,   fn,un,loc,req_fnum,att,fld,n,i,res)
{
  res = 1;
  
  if (! match(lin, /^<[^<>{}]*> *[{][^{}<>]*[}] *$/))
    { format_error("bad page locator line"); return 0; }
  
  # Extract location code
  if (! match(lin, /<.*>/)) { fatal_error("program error"); }
  loc = substr(lin, RSTART+1, RLENGTH-2);

  # decompose location code in f-number and unit
  if (match(loc, /[.][A-Za-z0-9]+$/))
    { un = substr(loc, RSTART+1, RLENGTH-1);
      fn = substr(loc, 1, RSTART-1);
    }
  else
    { un = ""; fn = loc; }

  # Check the f-number against the "location" parameter, if given.
  # The required f-number is "location" minus its optional ".UNIT"
  # suffix.
  # NOTE(review): only the f-number is compared, matching the
  # wording of the message; confirm whether the unit part of
  # "location" should be enforced as well.
  if (location != "") 
    { req_fnum = location;
      sub(/[.][A-Za-z0-9]+$/, "", req_fnum);
      if (fn != req_fnum) 
        { format_error(("wrong page f-number, should have been " req_fnum)); res = 0; } 
    }

  # Publish the header's location for the text-line checks:
  unit = un; fnum = fn;

  # Check page f-number format:
  if (! match(fnum, /^f[0-9]+[vr]?[0-9]?$/))
    { format_error("bad f-number in ##-header"); res = 0; }
  
  # Check attribute list: blank-separated items of the form $X=Y,
  # with X an uppercase letter and Y an uppercase letter or digit.
  match(lin, /{.*}/);
  if (RSTART == 0) { fatal_error("program error"); }
  att = substr(lin, RSTART+1, RLENGTH-2);
  n = split(att, fld, " ");
  for (i=1;i<=n;i++)
    { if (! match(fld[i], /^ *[$][A-Z][=][A-Z0-9] *$/))
        { format_error("bad page attribute"); res = 0; }
    }
  return res;
}

# Reports a command-line argument problem and the usage text on
# stderr, then ends the run with status 1.
function arg_error(msg)
{
  abort = 1;
  printf("*** %s\n", msg) > "/dev/stderr";
  printf("usage: %s\n", usage) > "/dev/stderr";
  exit abort;
}

# Reports an unrecoverable error, tagged with the current file name
# and line number, on stderr, then ends the run with status 1.
function fatal_error(msg)
{
  abort = 1;
  printf("file %s, line %d: *** %s\n", FILENAME, FNR, msg) > "/dev/stderr";
  exit abort;
}
    
# Reports a format problem in the current input line on stderr.
# Processing continues; the caller decides whether to count it.
function format_error(msg)
{
  printf("file %s, line %d: %s\n", FILENAME, FNR, msg) > "/dev/stderr";
}

# Echoes the current record ($0), tagged with file name and line
# number, to stderr, followed by an empty line.
function print_line()
{
  printf("file %s, line %d: %s\n", FILENAME, FNR, $0) > "/dev/stderr";
  printf("\n") > "/dev/stderr";
}