#! /usr/bin/gawk -f
# Last edited on 1998-09-14 00:21:20 by stolfi

# usage: best-pick [-v trcodes=STRING] < INFILE.evt > OUTFILE.evt
#
# Reads an EVT-format file and selects the "best" transliteration
# for each line.

function error(msg)
{
  # Reports MSG on stderr, prefixed with the current input record number,
  # then marks the run as failed and stops reading input.  The bare `exit`
  # passes control to the END rule, which tests `abort` and exits with
  # status 1.
  abort = 1
  printf("line %d: %s\n", NR, msg) > "/dev/stderr"
  exit
}

BEGIN {
  # State of the best candidate seen so far for the current text line:
  ofnum = "";  # page (f-number) of the pending line; "" means nothing pending
  otrcd = "";  # transcriber code of the pending candidate
  ofile = "";  # not referenced by the visible rules; kept for compatibility
  ounln = "";  # "UNIT.LINE" key of the pending line
  abort = 0;   # becomes 1 once a fatal error has been reported

  # `trcodes` lists the transcriber codes in order of preference (the
  # leftmost code wins); override it with `-v trcodes=STRING`.
  # NOTE(review): the default string contains "R" twice and no "H", so
  # "R" ends up with rank 19 and "H" has no rank at all -- confirm whether
  # that is intended before changing it.
  if (trcodes == "") { trcodes = "UVZABENOPRSWXYKQLMRJITFGCD"; }
  len = length(trcodes);
  if ( len != 26 )
    { # Diagnostics go to stderr so that they never mix with the EVT
      # output on stdout (same convention as error()).
      printf "line %d: bad trcodes\n", NR > "/dev/stderr";
      abort = 1;
      exit;
    }
  # trpri[c] is the rank of code c; a smaller rank means higher priority.
  for (i=1; i<=len; i++) { c = substr(trcodes, i, 1); trpri[c] = i; }
}

function oout()
{
  # Flushes the pending record "(oloc, otxt)" to stdout, with the location
  # left-justified in a 19-column field, and then clears it.
  # Nothing is written while no page has been seen yet (ofnum is empty).
  if (ofnum == "") return;
  printf("%-19s%s\n", oloc, otxt);
  otxt = "";
  oloc = "";
}

# Lines starting with "#" are comments: they are skipped and never copied
# to the output.  After a fatal error the run ends with status 1 instead.
/^#/ {
  if (abort) exit 1;
  next;
}

# Main rule: for every non-empty, non-comment line, work out its location
# (page, unit, line, transcriber code) and keep it as the pending candidate
# if it is the first version of that text line or comes from a transcriber
# with a better (smaller) rank than the candidate held so far.
/./ {
  if (abort) { exit 1; }
  if (match($0, /^<f[0-9][0-9]*[vr][0-9]*\.[A-Za-z][A-Za-z0-9]*\./)) 
    { # The line carries a location code "<FNUM.UNIT.LINE[;TRCD]>".
      tmp = substr($0,2,index($0,">")-2);
      # The text is taken to start after a fixed 19-column location field.
      skip = 19;
      # Analyze and regularize location code: 
      gsub(/[.;]/, " ", tmp);
      split(tmp, locf);
      if ((!(3 in locf)) || (5 in locf)) error("bad location format");
      fnum = locf[1];
      unit = locf[2];
      line = locf[3];
      if (4 in locf) 
        { trcd = locf[4]; }
      else
        { trcd = "X"; }
    }
  else if (substr($0,1,1) == "<") 
    { error("bad location code");
    }
  else 
    { # Plain text without a location code: synthesize one from the
      # record number so that every such line is its own text line.
      skip = 0;
      fnum = "f0"; 
      unit = "P";
      line = NR;
      # Use the same default transcriber code as a location without one;
      # otherwise `trcd` would keep a stale value from an earlier record
      # (or be empty) and end up in the generated location.
      trcd = "X";
    }
  
  # Lines that consist of a location field only carry no text: drop them.
  if (skip >= length($0)) next;
  txt = substr($0,1+skip);
  
  loc = sprintf ("<%s.%s.%s;%s>", fnum, unit, line, trcd);
  unln = ( unit "." line );

  # A change of page or of unit/line starts a new text line: flush the
  # pending candidate and start over with no candidate selected.
  if (( fnum != ofnum ) || ( ounln != unln ))
    { oout();
      ofnum = fnum;
      ounln = unln;
      otrcd = "";
    }
  
  # Keep this version if it is the first one for the text line or if its
  # transcriber outranks the current candidate's.
  # NOTE(review): a code with no entry in trpri compares as 0 here and so
  # outranks every listed code -- confirm that this is acceptable.
  if (( otrcd == "" ) || ( trpri[trcd] < trpri[otrcd] ) )
    { 
      oloc = loc; otxt = txt; otrcd = trcd; 
    }
}

# At end of input (or after `exit`): report failure with status 1 if a
# fatal error was flagged, otherwise flush the last pending line.
END {
  if (abort) exit 1;
  oout();
}