#! /usr/bin/gawk -f
# Last edited on 1998-12-30 11:36:14 by stolfi

BEGIN {
  # Pending exit status. It stays negative until arg_error or
  # fatal_error records a status; the rules below test `abort >= 0'.
  abort = -1;

  # Command-line synopsis, printed by arg_error.
  usage = "extract-reading-tuples \\\n";
  usage = usage "  -f tuple-procs.gawk \\\n";
  usage = usage "  < INFILE.evt > OUTFILE.tup ";

  # Reads an interlinear file in EVMT format (EVA encoding) and writes
  # a list of 26-character tuples, one for each character position
  # present in the interlinear.
  #
  # The tuple for a given character position consists of the readings
  # of that position by all 26 (potential) transcribers, "A" thru "Z".
  # The "%" reading is assumed whenever a character position is not
  # covered by a particular transcription.
  #
  # The spaces, line breaks, para breaks, and the fillers "!" and "%"
  # are viewed as readings, too. In-line comments are replaced by "!"
  # fillers, preserving alignment.

  # Start with an empty batch. tup_clear_current_batch is not defined
  # in this part of the file (presumably it comes from the
  # tuple-procs.gawk library named in the usage text -- confirm).
  tup_clear_current_batch();

  # Counters reported on stderr by the END rule:
  #   n_lines     VMS text lines.
  #   n_variants  Interlinear text lines read.
  #   n_used      Interlinear text lines used in tuples.
  #   n_tuples    Tuples written.
  n_lines = n_variants = n_used = n_tuples = 0;
}

# Checked on every record: once an exit status has been recorded
# (abort >= 0), stop reading input and exit with that status.
(abort >= 0) { exit abort; }

# Blank line
$0 ~ /^ *$/ {
  # Records that are empty or made only of spaces are skipped.
  next;
}

# `##'-comment (page/unit header)
$0 ~ /^[#][#]/ {
  # Hand whatever batch is pending to tup_process_current_batch
  # (not defined in this part of the file), then drop the header line.
  tup_process_current_batch("");
  next;
}

# Other `#'-comment
$0 ~ /^[#]/ {
  # Any remaining line that starts with `#' is discarded.
  next;
}

# Uncommented page/unit header
$0 ~ /^<[^<>;]*>/ {
  # A leading <...> group containing no `;' is handled exactly like a
  # `##' header: process the pending batch, then drop the line.
  tup_process_current_batch("");
  next;
}

# Text line
$0 ~ /^</ {
  # Every remaining line that starts with `<' is passed whole to
  # tup_process_variant (not defined in this part of the file) and
  # then counted as an interlinear text line read.
  tup_process_variant($0, "");
  n_variants += 1;
  next;
}

# Other lines
$0 ~ /./ {
  # A non-empty record that reaches this rule is reported as a
  # fatal format error.
  fatal_error("bad line format");
}

END {
  # If an error status was recorded, leave with it right away:
  # no final batch processing and no statistics.
  if (abort >= 0) exit abort;

  # Process whatever batch is still pending at end of input.
  tup_process_current_batch("");

  # Summary of the run, written to the error stream.
  printf("%7d VMS text lines found\n", n_lines) > "/dev/stderr";
  printf("%7d interlinear text lines read\n", n_variants) > "/dev/stderr";
  printf("%7d interlinear text lines used\n", n_used) > "/dev/stderr";
  printf("%7d tuples written\n", n_tuples) > "/dev/stderr";
}

# Client functions called by tup_process_current_batch:

function process_batch_texts(loc,txt,trn,nv,nc, tuple,k)
{
  # Called with the cleaned-up texts, without comments.
  # `loc' is not used here; `tuple' and `k' are local work variables.

  # Start from an empty array and let tup_extract_tuples (not defined
  # in this part of the file) fill it from txt, trn, nv and nc.
  split("", tuple);
  tup_extract_tuples(txt,trn,nv,nc,tuple);

  # Print tuple[1..nc], one per output line, counting each one.
  k = 1;
  while (k <= nc)
    {
      print tuple[k];
      n_tuples++;
      k++;
    }

  # Empty the work array again.
  split("", tuple);

  # Update the global counters: nv more variants used, one more line.
  n_used += nv;
  n_lines++;
}

function process_batch_lines(batch,nb)
{
  # Called with the original (and new) lines, including comments.
  # This program has no use for them, so the body is deliberately
  # empty: neither `batch' nor `nb' is examined.
  return;
}

function arg_error(msg)
{
  # Writes `msg' (flagged with "***") and the usage synopsis to the
  # error stream, then records exit status 1 and exits with it.
  printf("*** %s\n", msg) > "/dev/stderr";
  printf("usage: %s\n", usage) > "/dev/stderr";

  # `abort' is set before exiting so the END rule sees the error.
  abort = 1;
  exit abort;
}

function fatal_error(msg)
{
  # Writes `msg', prefixed by the current file name and line number
  # and flagged with "***", to the error stream; then records exit
  # status 1 and exits with it.
  printf("file %s, line %d: *** %s\n", FILENAME, FNR, msg) > "/dev/stderr";

  # `abort' is set before exiting so the END rule sees the error.
  abort = 1;
  exit abort;
}
    
function format_error(msg)
{
  # Non-fatal diagnostic: writes `msg', prefixed by the current file
  # name and line number, to the error stream. Processing continues;
  # `abort' is left untouched.
  printf("file %s, line %d: %s\n", FILENAME, FNR, msg) > "/dev/stderr";
}

function print_line(lin)
{
  # Echoes `lin' to the error stream, prefixed by the current file
  # name and line number, followed by one empty line.
  printf("file %s, line %d: %s\n", FILENAME, FNR, lin) > "/dev/stderr";
  printf("\n") > "/dev/stderr";
}