#! /usr/bin/gawk -f
# Last edited on 2005-01-13 03:32:10 by stolfi
BEGIN {
  # {abort} holds the pending exit status; a negative value means that no
  # error has been recorded yet.
  abort = -1;
  usage = ( ARGV[0] " \\\n" \
    " [ -v trans=CODES ] [ -v require=BOOL ] \\\n" \
    " [ -v clean=BOOL ] [ -v prefix=STR ] [ -vsuffix=STR ] \\\n" \
    " [ -v txlen=NUM ] \\\n" \
    " < INFILE \\\n" \
    " > LOCLIST" \
  );
  # Reads a file in the EVMT or JS-interlinear format. Outputs one
  # record per text line, containing the line locator (without "<>"s
  # and without transcriber codes) and the first {txlen} chars of the
  # line.
  #
  # If {clean} is TRUE (default), removes all fillers, comments,
  # []-groups, weirdo codes, etc. If the {prefix} and/or {suffix}
  # strings are given, they are concatenated with the text. All these
  # options take effect before the text is truncated.
  #
  # If there are multiple transcriptions of the same line, outputs
  # only the most reliable one. The valid codes, in order of INCREASING
  # reliability, are specified through the {trans} variable. If {require}
  # is TRUE (default), there must be at least one valid transcription for
  # every locator that appears in the input file.

  # Defaults for the options that were not given with "-v".
  if (trans == "") { trans = "*"; }
  # The variable must be {require}: the name "default" is a reserved word
  # of gawk's switch statement, and assigning to it left {require} unset.
  if (require == "") { require = 1; }
  if (txlen == "") { txlen = 20; }
  if (clean == "") { clean = 1; }
  if (prefix == "") { prefix = ""; }
  if (suffix == "") { suffix = ""; }

  # Locator of the line currently being accumulated; empty until the first
  # text line is seen.
  oloc = "";
}
# Once an exit status has been recorded in {abort}, stop processing input.
abort >= 0 { exit abort; }
# Handles every record that begins with a "<...>" line locator.
/^[<][a-z][0-9]+[rv][0-9]*[.]/ {
  # The first field is the locator; strip its angle brackets.
  loc = $1;
  gsub(/[<>]/, "", loc);

  # Split a trailing ";X" transcriber code off into {trc}; "*" means "none".
  trc = "*";
  if (match(loc, /[;][A-Za-z]$/)) {
    trc = substr(loc, RSTART+1, 1);
    loc = substr(loc, 1, RSTART-1);
  }

  # Reliability {rel} is the position of {trc} in {trans} (0 if not listed).
  rel = index(trans, trc);

  # The sample text {txt} is the record without its leading locator.
  txt = $0;
  gsub(/^[<][^<>]*[>] */, "", txt);
  if (clean) { txt = strip_markup(txt); }

  # Attach {prefix} and {suffix}, then cut to the required length.
  txt = substr((prefix txt suffix), 1, txlen);

  # A change of locator closes the previous one; after that, remember this
  # transcription only if it is more reliable than the best one so far.
  if (loc != oloc) { flush_loc(); oloc = loc; orel = -1; }
  if (rel > orel) { orel = rel; otxt = txt; }
}

# Returns {s} with comments, fillers and special codes removed.
function strip_markup(s)
{
  # {}-comments; the second pass is just in case there are nested {}'s.
  gsub(/[{][^{}]*[}]/, "", s);
  gsub(/[{][^{}]*[}]/, "", s);
  # Plumes.
  gsub(/[\']/, "", s);
  # Spaces, fillers, line and parag markers, ()-markers.
  gsub(/[-!,.\/= ()]/, "", s);
  # Gabriel-style weirdoes ("&" digits ";") become "*".
  gsub(/[&][0-9]+[;]/, "*", s);
  # Rene-style weirdoes ("$" and three digits) become "*".
  gsub(/[$][0-9][0-9][0-9]/, "*", s);
  # Keep only the first alternative in "[|]" groups.
  gsub(/[[]/, "", s);
  gsub(/[|][a-z\'*|]*[]]/, "", s);
  return s;
}
END {
  # With no error recorded, emit the last pending locator; otherwise just
  # leave with the recorded status.
  if (abort < 0) {
    flush_loc();
  } else {
    exit abort;
  }
}
# Emits the pending locator {oloc} with its best text {otxt}, if any.
# A pending reliability {orel} of 0 means that none of the transcriber codes
# seen for {oloc} occurs in {trans}; that is an error when {require} is set,
# and the locator is silently skipped otherwise.
function flush_loc()
{
  # Nothing is pending before the first locator has been seen.
  if (oloc == "") { return; }

  if (orel != 0) {
    print oloc, otxt;
    return;
  }

  if (require) {
    data_error(("no valid transcription for \"" oloc "\""));
  }
}
# Reports a command-line problem: writes {msg} and the usage text to
# standard error, records exit status 1 in {abort}, and exits.
# NOTE(review): nothing in the visible source calls this function.
function arg_error(msg)
{
  abort = 1;
  printf("%s\n", msg) > "/dev/stderr";
  printf("usage: %s\n", usage) > "/dev/stderr";
  exit 1;
}
# Reports a problem in the input: writes the current record number (FNR),
# {msg} and the current record ($0) to standard error, records exit status 1
# in {abort}, and exits.
function data_error(msg)
{
  abort = 1;
  printf("line %d: %s\n", FNR, msg) > "/dev/stderr";
  printf(" %s\n", $0) > "/dev/stderr";
  exit 1;
}