#! /usr/bin/gawk -f
# Last edited on 1998-12-29 05:36:48 by stolfi

BEGIN {
  abort = -1;
  usage = ( \
      "extract-reading-tuples \\\n" \
      "  < INFILE.evt > OUTFILE.tup " \
    );
    
  # Reads an interlinear file in EVMT format (EVA encoding) and writes
  # a list of 26-character tuples, one for each character position
  # present in the interlinear.
  #
  # The tuple for a given character position consists of the readings
  # of that position by all 26 (potential) transcribers, "A" thru "Z".
  # The  "%" reading is assumed whenever a character position is not
  # covered by a particular transcription.
  #
  # The spaces, line breaks, para breaks, and the fillers "!" and "%"
  # are viewed as readings, too. In-line comments are replaced by "!"
  # fillers, preserving alignment.
    
  trset = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
  bangs = "!"; while (length(bangs) < 500) { bangs = (bangs bangs); }
  blanks = " "; while (length(blanks) < 19) { blanks = (blanks blanks); }

  # Location fields of current line:
  cur_fn = ""; cur_un = ""; cur_ln = "";
  cur_nc = -1; # Current line length.
  cur_ok = 1;  # means that all versions of current line have been processed.
  
  # Versions of the current line
  cur_nt = 0;         # number of versions read so far. 
  split("", cur_tx);  # "cur_tx[i]", "i=0..cur_nt-1" are the versions, in reading order.
  split("", cur_tk);  # "cur_tk[i]" are the corresponding transcriber codes, 1..26.
  split("", cur_ts);  # "cur_ts[tr] = 1" means transcriber "tr" has occurred.
  
  # Various counts:
  n_lines = 0; # VMS text lines.
  n_read = 0;  # Interlinear text lines read.
  n_used = 0;  # Interlinear text lines used in tuples.

  # tuple for output
  split("", tup);
}

//{ if (abort >= 0) { exit abort; } }

# Blank line
/^ *$/ {
  next;
}

# `##'-comment (page/unit header)
/^[#][#]/ {
  process_current_line();
  next;
}

# Other `#'-comment
/^[#]/ {
  process_current_line();
  next;
}

# Uncommented page/unit header
/^<[^<>;]*>/ {
  process_current_line();
  next;
}

# Text line
/^</ {
  if (! process_variant($0)) { print_line($0); }
  next;
}

# Other lines
/./ {
  format_error("bad line format"); 
  print_line($0);
  next;
}

END {
  if (abort >=0) { exit abort; }
  process_current_line();
  printf "%7d interlinear text lines read\n", n_read > "/dev/stderr";
  printf "%7d interlinear text lines used\n", n_used > "/dev/stderr";
  printf "%7d VMS lines processed\n", n_lines > "/dev/stderr";
}

function process_variant(lin,  txt,loc,tmp,fn,un,tc,tk,nf,res)
{
  res = 1;
  n_read++;

  if (length(lin) <= 19)
    { format_error("missing text"); res = 0; }
  
  # Check general format, and extract location code and text proper.
  # Note that line number must start with digit,
  # while the location code must start with letter:
  match(lin, /^<f[0-9]+[vr]?[0-9]?[.](|[A-Za-z][A-Za-z0-9]*[.])[0-9]+[a-e]?;[A-Z]>/);
  if (RSTART != 1) 
    { format_error("bad location format"); res = 0; }
  else
    { 
      loc = substr(lin,RSTART+1,RLENGTH-2);
      txt = substr(lin,RLENGTH+1);
      if (substr(lin,RLENGTH+1, 19-RLENGTH) != substr(blanks, 1, 19-RLENGTH))
        { format_error("too few blanks"); res = 0; }
      if (substr(lin,20,1) == " ")
        { format_error("too many blanks"); res = 0; }

      gsub(/^[ ]+/, "", txt);
      gsub(/[ ]+$/, "", txt);

      # Validate location code
      # Split location into fields:
      tmp = length(loc);
      tc = substr(loc, tmp,1);
      if (substr(loc, tmp-1, 1) != ";")  { fatal_error("program error: semicolon"); }
      loc = substr(loc, i, tmp-2);
      nf = split(loc, tmp, /[.]/);
      if (nf == 3) 
        { fn = tmp[1]; un = tmp[2]; ln = tmp[3]; }
      else if (nf == 2) 
        { fn = tmp[1]; un = "-"; ln = tmp[2]; }
      else
        { fatal_error("program error: nf"); }

      # Remove trailing comments, if any, and save line length:
      while (gsub(/{[^{}]*}$/, "", txt)) { }
      nc = length(txt);
      
      # Convert any remaining comments to "!"s:
      txt = remove_comments(txt);
      
      # Check for non-decreasing lines
      if ((fn == cur_fn) && (un == cur_un) && (cur_ok == 1))
        { if (numeric_line(ln) <= numeric_line(cur_ln))
            { format_error("lines out of order, or interloping comments"); }
        }

      # If new line, flush previous one:
      if ((fn != cur_fn) || (un != cur_un) || (ln != cur_ln))
        { process_current_line(); }
      else
        { if ((cur_nc != -1) && (nc != cur_nc))
            { format_error(("inconsistent line lengths (" cur_nc ":" nc ")"));
              res = 0;
            }
        }

      # Save this version:
      if (tc in cur_ts) 
        { format_error("repeated transcription code"); res = 0; }
      else
        { cur_ts[tc] = 1;
          if (res == 1)
            { # Save for later tuple extraction:
              cur_tx[cur_nt] = txt; 
              tk = index(trset, tc);
              if (tk == 0) { fatal_error("program error: tr"); }
              cur_tk[cur_nt] = tk; 
              cur_nt++;
              n_used++;
            }
        }

      # Update current location and line length:
      cur_fn = fn; cur_un = un; cur_ln = ln; cur_nc = nc;
    }

  cur_ok = 0;
  return res;
}

function remove_comments(txt,   chunk,i,res)
{
  # Replaces {}-comments by an equal length of "!"s
  
  res = "";
  while (txt != "")
    { i = index(txt, "{");
      if (i == 0) 
        { res = (res txt); txt = ""; } 
      else 
        { res = (res substr(txt, 1, i-1)); 
          txt = substr(txt, i);
          i = index(txt, "}");
          if (i == 0) 
            { format-error("mismatched `{'"); res = (res txt); txt = ""; }
          res = (res substr(bangs, 1,i)); 
          txt = substr(txt, i+1);
        }
    }
  return res;
}

function process_current_line(    i,loc,lin)
{
  if (cur_ok != 1) { 
    extract_tuples(cur_tx, cur_tk, cur_nt);
    n_lines++;
  }
  split("", cur_tx); split("", cur_tk); split("", cur_ts); cur_nt = 0; 
  cur_ok = 1;
}

function extract_tuples(cur_tx, cur_tk, cur_nt,  i,j,new,nc,m,c,d,and,tup)
{
  # For each character position, creates a tuple from "cur_tx[i]", "i=0..cur_nt-1",
  # and writes it to standard output.
  
  # Check lengths, just to be sure:
  nc = length(cur_tx[0]);
  for (i=1; i<cur_nt; i++) 
    { if (cur_nc != length(cur_tx[i])) 
        { fatal_error(("diff lengths [" cur_tx[i] "]")); }
    }

  for (j=1; j<=nc; j++)
    { for (k=1;k<=26;k++) { tup[k] = "%"; }
      for (i=0; i<cur_nt; i++) 
        { d = substr(cur_tx[i], j,1);
          tup[cur_tk[i]] = d;
        }
      for (k=1;k<=26;k++) { printf "%s", tup[k]; }
      printf "\n";
    }
}

function numeric_line(n, m,i)
{
  # Converts a line number "n" to an integer, for sorting purposes.
  if (n ~ /^[0-9]+$/)
    { n = 10*n; }
  else if (n ~ /^[0-9]+[a-e]$/)
    { m = length(n);
      i = index("abcde", substr(n,m,1));
      if (i == 0) { fatal_error("program error: let->num"); }
      n = 10*substr(n,1,m-1) + i;
    }
  else
    { fatal_error("program error: line num"); }
  return n;
}

function arg_error(msg)
{
  printf "*** %s\n", msg > "/dev/stderr"; 
  printf "usage: %s\n", usage > "/dev/stderr"; 
  abort = 1; exit abort;
}

function fatal_error(msg)
{
  printf "file %s, line %d: *** %s\n", FILENAME, FNR, msg > "/dev/stderr"; 
  abort = 1; exit abort;
}
    
function format_error(msg)
{
  printf "file %s, line %d: %s\n", FILENAME, FNR, msg > "/dev/stderr";
}

function print_line(lin)
{
  printf "file %s, line %d: %s\n", FILENAME, FNR, lin > "/dev/stderr";
  printf "\n" > "/dev/stderr";
}