#! /usr/bin/gawk -f
# Last edited on 1998-12-22 12:32:30 by stolfi

BEGIN {
  usage = ( "best-pick [-v trcodes=STRING] [-v discardComments=1] < FILE.evt > FILE.evt" );
  abort = -1;

  # Reads an EVT-format file and selects the "best" transliteration
  # for each line. Optionally discards #-comments.
  #
  # Command-line variables:
  #   trcodes          26 distinct one-character transcriber codes, listed
  #                    from most preferred to least preferred.
  #   discardComments  if true, #-comment lines are dropped from the output.
  #
  # "abort" stays negative while everything is fine; the error functions
  # set it to the exit status so that the END rule can propagate it.

  # State of the output line currently being assembled:
  opgun = "";   # page/unit part of its locator
  otrcd = "";   # transcriber code of the version picked so far
  ofile = "";
  oline = "";   # line part of its locator
  ocmts = "";   # comment lines buffered since the last flush
  oloc = "";    # regularized locator of the version picked so far
  otxt = "";    # text of the version picked so far

  if (trcodes == "") { trcodes = "UVHZTFABENOPRSWXYKQLMJIGCD"; }
  if (discardComments == "") { discardComments = 0; }
  len = length(trcodes);
  if ( len != 26 ) { arg_error("bad trcodes"); }

  # trpri[c] is the rank of code c (1 = best).  A repeated letter would
  # silently overwrite its earlier rank and leave another code unranked,
  # so it is rejected here.
  for (i=1; i<=len; i++)
    { c = substr(trcodes, i, 1);
      if (c in trpri) { arg_error("duplicate letter in trcodes"); }
      trpri[c] = i;
    }
}

function oout()
{ 
  # Flushes the pending output: writes the picked line "(oloc, otxt)",
  # if there is one, followed by the buffered comments "ocmts", and
  # clears all three.
  #
  # The test is on "oloc" rather than "opgun": "opgun" stays set after a
  # flush, so testing it would print an empty 19-column line whenever
  # oout() is called twice in a row with nothing picked in between
  # (e.g. a locator-only line followed by the first line of a new page).
  if (oloc != "") 
    { printf "%-19s%s\n", oloc, otxt;
      oloc = ""; otxt = ""; 
    }
  printf "%s", ocmts;
  ocmts = "";
}

/^[#]/ {
  # Comment line: unless comments are being discarded, append it to the
  # buffer "ocmts", which oout() prints after the pending data line.
  if (abort >= 0) exit abort;
  if (discardComments) next;
  ocmts = ocmts $0 "\n";
  next;
}

/^ *$/ {
  # Empty line, or line made only of spaces: not copied to the output.
  if (abort >= 0) exit abort;
  next;
}

/^<.*> *$/ {
  # Line that starts with "<" and ends with ">" (plus optional trailing
  # spaces), i.e. a locator with no text after it: flush whatever is
  # pending, then copy the line through unchanged.
  if (abort >= 0) exit abort;
  oout();
  print $0;
  next;
}

/^</ {
  # Data line: "<PAGE[.UNIT].LINE;TRCODE>" locator followed by the text.
  # Keeps, for each (page/unit, line) pair, the version whose transcriber
  # code has the best (smallest) rank in "trpri".
  if (abort >= 0) { exit abort; } 
  if (match($0, /^<f[0-9][0-9]*[vr][0-9]*(|[.][A-Za-z][A-Za-z0-9]*)[.]/)) 
    { tmp = substr($0,2,index($0,">")-2);
      # The text starts in column 20; oout() pads the locator back to
      # 19 columns when writing.
      skip = 19;
      # Analyze and regularize location code: 
      # "." and ";" are both accepted as field separators.
      gsub(/[.;]/, " ", tmp);
      nf = split(tmp, locf);
      if (nf == 3) 
        { pgun = locf[1]; line = locf[2]; trcd = locf[3]; }
      else if (nf == 4)
        { pgun = (locf[1] "." locf[2]); line = locf[3]; trcd = locf[4]; }
      else 
        { format_error("bad locator fields"); }
    }
  else
    { format_error("bad locator format"); }

  # An unranked code would yield an uninitialized trpri[] entry, which
  # compares as 0 and so would outrank every legitimate code.  Using
  # "in" also avoids creating the entry as a side effect.
  if (! (trcd in trpri)) { format_error("bad transcriber code"); }
  
  # Lines with no text beyond the locator field are ignored.
  if (skip >= length($0)) next;
  txt = substr($0,1+skip);
  
  loc = sprintf ("<%s.%s;%s>", pgun, line, trcd);

  # A change of page/unit or of line number closes the pending line.
  if ( pgun != opgun )
    { oout();
      opgun = pgun;
      oline = line;
      otrcd = "";
    }
  else if ( oline != line )
    { oout();
      oline = line;
      otrcd = "";
    }
  
  # Take this version if it is the first one seen for the line, or if
  # its code ranks strictly better than the one picked so far.
  if (( otrcd == "" ) || ( trpri[trcd] < trpri[otrcd] ) )
    { 
      oloc = loc; otxt = txt; otrcd = trcd; 
    }
  next;
}

{
  # Reached only by lines that none of the preceding rules consumed
  # (each of them ends with "next"): not a comment, not blank, and not
  # starting with "<".
  format_error("bad line format");
}

END {
  # On a clean run, flush the last pending line and comments; after an
  # error, just propagate the recorded exit status.
  if (abort < 0)
    { oout(); }
  else
    { exit abort; }
}

function format_error(msg)
{
  # Reports a malformed input line on stderr (file name, line number,
  # message "msg", then the offending line itself), records the failure
  # in "abort" for the END rule, and terminates with status 1.
  #
  # Both writes use ">>", like arg_error(): mixing ">" and ">>" on the
  # same output file is discouraged by the gawk manual.
  abort = 1;
  printf "file %s, line %d: %s\n", FILENAME, FNR, msg >> "/dev/stderr";
  print $0 >> "/dev/stderr";
  exit 1;
}

function arg_error(msg)
{
  # Reports a command-line problem: writes "msg" and the usage summary
  # to stderr, records the failure in "abort" for the END rule, and
  # terminates with status 1.
  abort = 1;
  printf "%s\n", msg >> "/dev/stderr";
  printf "usage: %s\n", usage >> "/dev/stderr";
  exit 1;
}