#! /usr/bin/gawk -f
# Last edited on 1998-12-30 11:39:59 by stolfi

BEGIN {
  abort = -1;
  usage = ( \
      "combine-versions \\\n" \
      "  -v ignore=CHARS \\\n" \
      "  -v code=CHAR \\\n" \
      "  -v position=[first|last] \\\n" \
      "  -v table=TBLFILE \\\n" \
      "  < INFILE.evt > OUTFILE.evt " \
    );
    
  # Reads an interlinear file in EVMT format (EVA encoding) and 
  # writes the same with an extra version, with transcriber code
  # "code". 
  #
  # The new version is computed by table lookup, using a table TBLFILE
  # that maps "reading tuples" to a single character. A reading tuple
  # for a given character position consists of the readings of that
  # position by all 26 (potential) transcribers, "A" thru "Z", encoded
  # as a string of 26 EVA letters. The "%" reading is assumed whenever
  # a character position is not covered by a particular transcription,
  # of for transcribers listed in the "ignore" list
  #
  # The spaces, line breaks, para breaks, and the fillers "!" and "%"
  # are viewed as readings, too. In-line comments are replaced by "!"
  # fillers, preserving alignment, before extracting the reading tuples.
    
  trset = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
  bangs = "!"; while (length(bangs) < 500) { bangs = (bangs bangs); }
  blanks = " "; while (length(blanks) < 19) { blanks = (blanks blanks); }

  if (code !~ /^[A-Z]$/) 
    { arg_error("bad or missing \"code\" argument"); }
  if ((position != "first") && (position != "last"))
    { arg_error("bad or missing \"position\" argument"); }
  if (ignore !~ /^[A-Z]*$/) 
    { arg_error("bad \"ignore\" argument"); }
  if (table == "") 
    { arg_error("must specify the \"table\" argument"); }
    
  tup_clear_current_batch();
  
  # Read weight table "wt[i]" where "i" is 1..26:
  nentries = 0;
  while (getline lin < table)
    { if ((lin !~ /^[#]/) && (lin !~ /^ *$/)) 
        { n = split(lin, fld);
          if (n != 2) { table_error("bad number of fields"); }
          if (length(fld[1]) != 26) { table_error("bad tuple"); }
          if (length(fld[2]) != 1) { table_error("bad output char"); }
          out[fld[1]] = fld[2];
          nentries++;
        }
    }
  close(table);
  if (nentries == 0) { arg_error("no entries in weight table"); }
  
}

//{ if (abort >= 0) { exit abort; } }

# Blank line
/^ *$/ {
  next;
}

# `##'-comment (page/unit header)
/^[#][#]/ {
  tup_process_current_batch(ignore);
  print;
  next;
}

# Other `#'-comment
/^[#]/ {
  tup_append_line_to_batch($0);
  next;
}

# Uncommented page/unit header
/^<[^<>;]*>/ {
  tup_process_current_batch(ignore);
  next;
}

# Text line
/^</ {
  tup_process_variant($0, ignore)
  next;
}

# Other lines
/./ {
  format_error("bad line format"); 
  print_line($0);
  next;
}

END {
  if (abort >=0) { exit abort; }
  tup_process_current_batch(ignore);
}

# Client functions called by tup_process_current_batch:

function process_batch_texts(loc,txt,trn,nv,nc, tuple)
{
  # Called with the cleaned-up texts, without comments.
  # Extract the tuples and build the combined version:
  split("", tuple);
  tup_extract_tuples(txt,trn,nv,nc,tuple);
  build_combined_version(loc,tuple,nc);
  split("", tuple);  
}

function build_combined_version(loc,tuple,nc,   j,tj,txt)
{
  # Build the combined version:
  txt = "";
  for (j=1; j<=nc; j++) { 
    tj = tuple[j];
    if (! (tj in out)) { fatal_error(("tuple \"" tj "\" not in table")); }
    txt = (txt out[tj]);
  }
  lin = sprintf("%-18.18s %s", ("<" loc ";" code ">"), txt);
  if (position == "first")
    { tup_prepend_version_to_batch(lin); }
  else
    { tup_append_version_to_batch(lin); }
}

function process_batch_lines(batch, nb)
{
  # Called with the original (and new) lines, including comments.
  # Output them:
  for (i=0; i<nb; i++) { print batch[i]; }
}

function arg_error(msg)
{
  printf "*** %s\n", msg > "/dev/stderr"; 
  printf "usage: %s\n", usage > "/dev/stderr"; 
  abort = 1; exit abort;
}

function fatal_error(msg)
{
  printf "file %s, line %d: *** %s\n", FILENAME, FNR, msg > "/dev/stderr"; 
  abort = 1; exit abort;
}
    
function format_error(msg)
{
  printf "file %s, line %d: %s\n", FILENAME, FNR, msg > "/dev/stderr";
}

function print_line(lin)
{
  printf "file %s, line %d: %s\n", FILENAME, FNR, lin > "/dev/stderr";
  printf "\n" > "/dev/stderr";
}