#! /usr/bin/gawk -f
# Last edited on 2005-01-13 03:32:10 by stolfi

BEGIN {
  # {abort} is negative while no error has occurred; the error
  # routines set it to the desired exit status.
  abort = -1;
  usage = ( ARGV[0] " \\\n" \
    "  [ -v trans=CODES ] [ -v require=BOOL ] \\\n" \
    "  [ -v clean=BOOL ] [ -v prefix=STR ] [ -vsuffix=STR ] \\\n" \
    "  [ -v txlen=NUM ] \\\n" \
    "  < INFILE \\\n" \
    "  > LOCLIST" \
  );

  # Reads a file in the EVMT or JS-interlinear format. Outputs one
  # record per text line, containing the line locator (without "<>"s
  # and without transcriber codes) and the first {txlen} chars of the
  # line. 
  
  # If {clean} is TRUE (default), removes all fillers, comments,
  # []-groups, weirdo codes, etc. If the {prefix} and/or {suffix}
  # strings are given, they are concatenated with the text. All these
  # options take effect before the text is truncated.
  # 
  # If there are multiple transcriptions of the same line, outputs
  # only the most reliable one. The valid codes, in order of INCREASING
  # reliability, are specified through the {trans} variable. If {require}
  # is TRUE (default), there must be at least one valid transcription for 
  # every locator that appears in the input file.
  
  # Apply defaults to every option that was not given with "-v".
  if (trans == "") { trans = "*"; }
  # This used to assign to a variable called "default", which left
  # {require} unset (i.e. false) and is a syntax error in gawk versions
  # where "default" is a reserved word of the switch statement.
  if (require == "") { require = 1; }
  if (txlen == "") { txlen = 20; }
  if (clean == "") { clean = 1; }
  # The next two are no-ops, kept only to record that both default to "".
  if (prefix == "") { prefix = ""; }
  if (suffix == "") { suffix = ""; }
  
  # Locator of the line whose best transcription is pending output
  # ("" means none yet).
  oloc = "";
}

# Once an error status has been recorded in {abort}, stop processing input.
abort >= 0 { exit abort }

/^[<][a-z][0-9]+[rv][0-9]*[.]/ {

  # The first field is the locator; strip its angle brackets.
  loc = $1;
  gsub(/[<>]/, "", loc);

  # A trailing ";X" is the transcriber code {trc}; split it off the
  # locator. Without one, {trc} is "*" (meaning "none").
  trc = "*";
  if (match(loc, /[;][A-Za-z]$/)) {
    trc = substr(loc, RSTART + 1, 1);
    loc = substr(loc, 1, RSTART - 1);
  }

  # Reliability {rel} is the position of the code within {trans};
  # it is 0 when the code is not listed there.
  rel = index(trans, trc);

  # The sample text {txt} is the record minus its leading locator.
  txt = $0;
  sub(/^[<][^<>]*[>] */, "", txt);

  if (clean) {
    # {}-comments; the second pass catches one level of nesting.
    gsub(/[{][^{}]*[}]/, "", txt);
    gsub(/[{][^{}]*[}]/, "", txt);
    # Apostrophes (plumes).
    gsub(/[\']/, "", txt);
    # Blanks, fillers, line and paragraph markers, parentheses.
    gsub(/[-!,.\/= ()]/, "", txt);
    # "&NNN;" weirdo codes become "*".
    gsub(/[&][0-9]+[;]/, "*", txt);
    # "$NNN" weirdo codes become "*".
    gsub(/[$][0-9][0-9][0-9]/, "*", txt);
    # In "[a|b|...]" groups retain only the first alternative:
    # drop the opening bracket, then everything from the first "|"
    # through the closing bracket.
    gsub(/[[]/, "", txt);
    gsub(/[|][a-z\'*|]*[]]/, "", txt);
  }

  # Attach prefix and suffix, then cut to {txlen} characters.
  txt = substr(prefix txt suffix, 1, txlen);

  # A new locator closes the previous one: emit it and start afresh.
  if (loc != oloc) {
    flush_loc();
    oloc = loc;
    orel = -1;
  }

  # Remember this transcription if it beats the best seen so far.
  if (rel > orel) {
    orel = rel;
    otxt = txt;
  }
}

END {
  # On a clean run, emit the last pending locator; otherwise just
  # propagate the recorded error status.
  if (abort < 0) {
    flush_loc();
  } else {
    exit abort;
  }
}

function flush_loc()
{
  # Emits the pending locator {oloc} with its best text {otxt}.
  # A best reliability {orel} of 0 means no transcription had a code
  # listed in {trans}; that is an error when {require} is set, and is
  # silently skipped otherwise.

  # Nothing is pending before the first locator has been seen.
  if (oloc == "") { return; }

  if (orel != 0) {
    print oloc, otxt;
    return;
  }

  if (require) {
    data_error(("no valid transcription for \"" oloc "\""));
  }
}

function arg_error(msg)
{
  # Reports {msg} followed by the usage text on stderr, records the
  # failure status in {abort}, and terminates with status 1.
  abort = 1;
  printf("%s\n", msg) > "/dev/stderr";
  printf("usage: %s\n", usage) > "/dev/stderr";
  exit 1;
}

function data_error(msg)
{
  # Reports {msg} on stderr, tagged with the current record number
  # (FNR) and followed by the current record ($0), records the failure
  # status in {abort}, and terminates with status 1.
  abort = 1;
  printf("line %d: %s\n", FNR, msg) > "/dev/stderr";
  printf("  %s\n", $0) > "/dev/stderr";
  exit 1;
}