#! /usr/bin/gawk -f
# Last edited on 1998-12-03 04:06:05 by stolfi

BEGIN {
  # "abort" stays negative while no fatal error has been flagged; it is
  # set to the desired exit status before any "exit", and the other
  # rules and END test it so that they do nothing but propagate it.
  abort = -1;
  usage = ( \
    "merge-version-into-interlin \\\n" \
    "  -v sourceFile=SRCFILE \\\n" \
    "  -v trashFile=TSHFILE \\\n" \
    "  -v transCodes=LETTERS \\\n" \
    "  < INFILE > OUTFILE" \
  );
  
  # Reads an EVMT-formatted interlinear transcription INFILE and
  # merges into it one more version SRCFILE, for transcriber codes
  # in LETTERS. Writes the result to the OUTFILE. Each text line from
  # SRCFILE is inserted before the first line from INFILE with the
  # same location code (f-number, unit, and line number). Also writes
  # to TSHFILE all records from SRCFILE that could not be merged,
  # including all comments.

  if (sourceFile == "") { arg_error("must specify \"-v sourceFile=FILE\"\n"); }
  if (trashFile == "") { arg_error("must specify \"-v trashFile=FILE\"\n"); }
  if (transCodes == "") { arg_error("must specify \"-v transCodes=LETTERS\"\n"); }
  # table "src" maps location code (minus transcriber) to 
  # the whole line.
  split("", src);
  
  sourceN = 0; # counts source records read.
  trashN = 0;  # counts rejected source records (incl comments).
  badN = 0;    # counts erroneous source records.
  goodN = 0;   # counts good source records.
  
  printf "reading source file...\n" > "/dev/stderr"; 
  # Keep the result of getline: 0 means end of file, but a negative
  # value means the file could not be read, which must not be mistaken
  # for an empty source file.
  while ((srcStatus = (getline lin < sourceFile)) > 0) { 
    sourceN++;
    bad = 0;
    trash = 0;
    if (match(lin, /^ *$/)) 
      { trash = 1; }
    else if (substr(lin,1,1) == "#")
      { trash = 1; }
    else if (substr(lin,1,1) == "<")
      { if (! match(lin, /^<f[0-9]+[vr][0-9]?[.][A-Za-z][0-9]?[.][0-9]+[a-e]?[;][A-Z]>/))
          { source_error("bad location code"); bad = 1; }
        else
          { # "loc" is the text between "<" and ">"; its last character
            # is the transcriber code, preceded by a ";" separator.
            loc = substr(lin, RSTART+1, RLENGTH-2);
            m = length(loc);
            cod = substr(loc, m, 1);
            if (index(transCodes, cod) == 0)
              { source_error("wrong transcriber code"); bad = 1; }
            # Strip the ";X" suffix to obtain the table key.
            loc = substr(loc, 1, m-2);
            if (loc in src) { source_error("duplicate location"); bad = 1; }
          }
      }
    else
      { source_error("bad line type"); bad = 1; }
    
    if (bad || trash)
      { print lin > trashFile; trashN++; if (bad) { badN++; } }
    else
      { src[loc] = lin; goodN++; }
  }
  if (srcStatus < 0)
    { printf "cannot read source file \"%s\": %s\n", sourceFile, ERRNO > "/dev/stderr";
      abort = 1; exit abort;
    }
  close (sourceFile);
  printf "read %6d source lines\n", sourceN > "/dev/stderr"
  printf "rejected %6d of them (including %d errors)\n", trashN, badN > "/dev/stderr"
  if (badN != 0) { abort = 1; exit abort; }

  # If nothing was rejected, the trash file has not been opened yet.
  # Open it now with an empty write so that it always exists and never
  # carries over contents from a previous run.
  if (trashN == 0) { printf "" > trashFile; }

  printf "merging files...\n" > "/dev/stderr"
  old_loc = "";    # location code of the previous input text line.
  insertN = 0;     # counts source records inserted into the output.
  unmatchedN = 0;  # counts source records with no matching input line.
  sawBlank = 1;    # nonzero when no "#" separator is needed before a new location.
}

# Input lines that are empty or consist only of spaces are skipped;
# they are not copied to the output.
/^ *$/ {
  if (abort >= 0)
    exit abort;
  next;
}

# Lines starting with "##": emit a bare "#" line first, then the line
# itself, and record that a separator has just been written.
/^[#][#]/ {
  if (abort >= 0)
    exit abort;
  sawBlank = 1;
  print "#";
  print $0;
  next;
}

# A "#" followed only by spaces is copied through and counts as a
# separator, so no extra "#" is emitted before the next location.
/^[#] *$/ {
  if (abort >= 0)
    exit abort;
  print $0;
  sawBlank = 1;
  next;
}

# Any other "#" comment line is copied to the output unchanged; the
# separator state (sawBlank) is left as it was.
/^[#]/ {
  if (abort >= 0)
    exit abort;
  print $0;
  next;
}

# Text lines (starting with "<"). When the location code differs from
# that of the previous text line, a "#" separator is emitted if needed
# and the pending source line for that location, if any, is written
# just before the input line. The input line itself is always copied.
/^[<]/ {
  if (abort >= 0)
    exit abort;

  lin = $0;
  loc = "";
  if (match(lin, /^<f[0-9]+[vr][0-9]?[.][A-Za-z][0-9]?[.][0-9]+[a-e]?[;][A-Z]>/)) {
    # Text between "<" and ">": location, ";" and transcriber letter.
    loc = substr(lin, RSTART+1, RLENGTH-2);
    m = length(loc);
    cod = substr(loc, m, 1);
    loc = substr(loc, 1, m-2);   # drop the ";X" suffix

    # The input is reported if it already carries one of the
    # transcriber codes given in transCodes.
    if (index(transCodes, cod) != 0)
      input_error("wrong transcriber code");

    if ((loc != "") && (loc != old_loc)) {
      if (! sawBlank)
        print "#";
      if (loc in src) {
        print src[loc];
        delete src[loc];   # consumed; END treats what is left as unmatched
        insertN++;
      }
      old_loc = loc;
    }
  } else {
    input_error("bad location code");
  }

  print lin;
  sawBlank = 0;
  next;
}

# Catch-all: a line matched by none of the rules above is reported on
# stderr and is not copied to the output.
// {
  if (abort >= 0)
    exit abort;
  input_error("unrecognized line type");
  next;
}

END {
  # Propagate the exit status when a fatal error was flagged earlier.
  if (abort >= 0) { exit abort; }
  # Whatever is still in "src" was never matched by an input location;
  # send those records to the trash file. The order of traversal is
  # whatever "for (... in ...)" yields.
  for (loc in src)
    { print src[loc] > trashFile; trashN++; unmatchedN++; }
  close(trashFile);
  # Report both counters so that the summary can be checked against
  # the number of source records read.
  printf "%7d inserted source lines\n", insertN > "/dev/stderr"
  printf "%7d unmatched source lines\n", unmatchedN > "/dev/stderr"
}

# Reports a command-line argument error: writes msg and the usage
# text to stderr, flags the failure in "abort", and exits with
# status 1.
function arg_error(msg)
{
  printf("%s\n", msg) > "/dev/stderr";
  printf("usage: %s\n", usage) > "/dev/stderr";
  abort = 1;
  exit abort;
}

# Writes a diagnostic about the source file to stderr, tagged with the
# source file name and the number of source records read so far.
# Does not terminate the program.
function source_error(msg)
{
  printf("file %s, line %d: %s\n", sourceFile, sourceN, msg) > "/dev/stderr";
}

# Writes a diagnostic about the main input to stderr, tagged with
# awk's FILENAME and FNR for the current record.
# Does not terminate the program.
function input_error(msg)
{
  printf("file %s, line %d: %s\n", FILENAME, FNR, msg) > "/dev/stderr";
}