#! /usr/bin/gawk -f
# Last edited on 2004-11-19 02:02:19 by stolfi

BEGIN {
  # Reads an EVMT file from stdin, maps the locators through a table,
  # and writes the result to stdout.
  #
  # Each line of the {table} file should have two words "{OLD} {NEW}",
  # meaning that the old locator {OLD} should be mapped to {NEW}.
  # The locators in this table should include neither the "<>"
  # delimiters nor the transcriber codes.
  #
  # Whenever an input location code is not found in the table,
  # it is left alone.  A message is printed if {pedantic} is true.

  # Command-line synopsis.
  usage = ( \
    ARGV[0] "\\\n" \
    "  -v table=TBLFILE [ -v inverse=BOOL ] \\\n" \
    "  [ -v pedantic=BOOL ] \\\n" \
    "  < INFILE > OUTFILE " \
  );

  # Negative means "no error so far"; the error functions set it to 1.
  abort = -1;

  # Validate the "-v" options and supply defaults.
  if (table == "") { arg_error("must specify \"-v table=FILE\"\n"); }
  if (inverse == "") inverse = 0;

  # Start from an empty dictionary {dic}, then fill it from the table file.
  split("", dic);
  read_table(table, inverse, dic);
}

# Comment lines (starting with "#") and blank lines are copied to the
# output unchanged.
# NOTE(review): since this rule ends in "next", no line starting with "#"
# ever reaches the locator rule further down, so the "##" prefix
# alternative in that rule's pattern is never exercised -- confirm
# that this is intended.
/^[\#]/ || /^ *$/ {
  print $0;
  next;
}

function read_table(fname,inv,tbl,    ntbl,nlin,lin,fld,nfld,tmp,status)
{
  # Loads the two-column file {fname} into the array {tbl}, so that
  # {tbl[OLD] = NEW} for each line "OLD NEW".  If {inv} is true the
  # columns are swapped, i.e. {tbl[NEW] = OLD}.
  #
  # Blank lines and lines whose first non-blank character is "#" are
  # skipped.  A third field starting with "#" is taken as a trailing
  # comment and ignored.  Malformed lines and repeated keys are reported
  # through {tbl_error}; an empty or unreadable file is reported through
  # {arg_error}.
  #
  # NOTE(review): {tbl_error} is not defined in the visible part of this
  # file -- confirm that a definition exists elsewhere.

  ntbl=0;
  nlin=0;
  # Keep the {getline} status so that a read failure (-1) can be told
  # apart from a normal end of file (0) after the loop.
  while((status = (getline lin < fname)) > 0) { 
    nlin++;
    if (lin !~ /^[ \011]*([#]|$)/)
      { nfld = split(lin, fld, " ");
        if ((nfld >= 3) && (fld[3] ~ /^[\#]/)) { nfld = 2; }
        if (nfld != 2) { tbl_error(fname, nlin, ("bad table entry = \"" lin "\"")); }
        # If {inv} is true, swap the two columns:
        if (inv) { tmp = fld[1]; fld[1] = fld[2]; fld[2] = tmp; }
        # Check the table being filled, not the global it happens to alias:
        if (fld[1] in tbl) { tbl_error(fname, nlin, ("repeated key = \"" lin "\"")); }
        tbl[fld[1]] = fld[2];
        ntbl++;
      }
  }
  # Do not test ERRNO by itself: its value when no error has occurred
  # differs between gawk versions.  The {getline} status is reliable.
  if (status < 0) { tbl_error(fname, nlin, ERRNO); }
  close (fname);
  if (nlin == 0) { arg_error(("file \"" fname "\" empty or missing")); }
  # printf "loaded %6d map pairs\n", ntbl > "/dev/stderr"
}

# Safety net: if an error has been flagged, stop with that status.
(abort >= 0) { exit abort; }

# Data lines: a locator "<...>" at the start of the line, optionally
# preceded by "##".  The locator is mapped and the line is printed.
# NOTE(review): lines starting with "#" are consumed by the pass-through
# rule near the top of the file, so the "##" alternative below cannot
# currently match -- confirm which behaviour is intended.
/^(|[\#][\#] *)</ { 
  $0 = map_locator($0);
  print;
  next;
}

# Comment lines and blank lines never get this far: the rules near the
# top of the file print them and skip to the next record.  The duplicate
# rules that used to sit here were unreachable and have been removed.

# Anything else is a malformed line.  {format_error} terminates the
# program, so the statements after it are only a fallback.
// { 
  format_error("bad line format");
  print;
  next;
}

function map_locator(str,  head,loc,tail,dlen,trc)
{
  # Maps the first locator "<...>" in string {str} through the global
  # table {dic} and returns the modified string.  A transcriber code
  # (";" plus a capital letter) at the end of the locator is kept as is.
  # If the locator is not in {dic} it is left unchanged, and a message is
  # printed to stderr when the global {pedantic} is true.
  # Tries to preserve the line length by removing or inserting blanks
  # just after the closing ">".
  # Calls {format_error} if {str} contains no well-formed locator.

  # Match against the argument itself, not the current record.
  if (! match(str, /<[a-z][0-9]+[rv]?[0-6]?(|[.][^<>; ]*)(|[;][A-Z])[>]/))
    { format_error("bad location code format"); }
  head = substr(str,1,RSTART);             # Everything up to and including "<".
  loc  = substr(str,RSTART+1,RLENGTH-2);   # Text between "<" and ">".
  tail = substr(str,RSTART+RLENGTH-1);     # From the closing ">" onwards.
  
  # Split off the transcriber code, if any, so that only the bare
  # locator is looked up.
  if (match(loc, /[;]/)) 
    { trc = substr(loc,RSTART);
      loc = substr(loc,1,RSTART-1);
    }
  else
    { trc = ""; }
    
  dlen = length(loc);
  if (loc in dic) 
    { loc = dic[loc]; }
  else if (pedantic)
    { printf "file %s, line %d: locator not found: %s\n", FILENAME, FNR, loc > "/dev/stderr"; }
    
  # {dlen} becomes the growth of the locator: eat one blank after ">" per
  # extra character (when there is one), or add one per missing character.
  dlen = length(loc) - dlen;
  while (dlen > 0) { sub(/^> /, ">", tail); dlen--; }
  while (dlen < 0) { sub(/^>/, "> ", tail); dlen++; }
  return (head loc trc tail);
}

function format_error(msg)
{
  # Reports the problem {msg} found in the current input record on
  # stderr, tagged with the input file name and line number, then
  # records the failure in {abort} and terminates with status 1.
  printf("file %s, line %d: %s\n", FILENAME, FNR, msg) > "/dev/stderr";
  abort = 1;
  exit 1;
}

function arg_error(msg)
{ 
  # Reports a problem {msg} with the program's arguments on stderr,
  # followed by the usage synopsis held in the global {usage} (when it
  # is set), then records the failure in {abort} and terminates with
  # status 1.
  printf "%s\n", msg > "/dev/stderr";
  if (usage != "") { printf "usage: %s\n", usage > "/dev/stderr"; }
  abort = 1;
  exit 1;
}