#! /usr/bin/gawk -f
# Last edited on 2004-07-15 23:37:55 by stolfi
BEGIN {
  # "abort" stays negative while everything is fine; the error helpers
  # set it to the desired exit status before calling "exit".
  abort = -1;

  # Validates the format of the new split and unfolded interlinear file.
  # (The synopsis previously had a stray "]" after "checkTerminators=1".)
  usage = ( \
    "validate-new-evt-format \\\n" \
    "  [ -v chars=CHARS ] \\\n" \
    "  [ -v location=LOC ] \\\n" \
    "  [ -v checkTerminators=1 ] \\\n" \
    "  [ -v checkLineLengths=1 ] \\\n" \
    "  [ -v requireUnitHeaders=0 ] \\\n" \
    "  [ -v requirePageHeaders=0 ] \\\n" \
    "  < INFILE > OUTFILE " \
  );

  # where CHARS are the allowed non-space letters, and LOC is a
  # location code without line number e.g. "f103v2.T1" or "f83r"

  # Default alphabet of characters accepted inside the text proper.
  if (chars == "")
    { chars = "\"'abcdefghijklmnopqrstuvxyzAEFIKOPSTY"; }

  # Word-break and terminator characters are handled separately by the
  # text checks, so they may not be part of the "chars" list.
  if (chars ~ /[-=,.]/)
    { arg_error("invalid characters in \"chars\" list\n"); }

  # Optional checks default to off; header requirements default to on.
  if (checkTerminators == "") { checkTerminators = 0; }
  if (checkLineLengths == "") { checkLineLengths = 0; }
  if (requireUnitHeaders == "") { requireUnitHeaders = 1; }
  if (requirePageHeaders == "") { requirePageHeaders = 1; }

  # The "location" parameter is used to check ## <..> lines
  # exclusively. The values of "fnum" and "unit" are extracted from
  # those lines and used to check text lines.

  # Location fields of previous line:
  old_fn = ""; old_un = ""; old_ln = "";

  # Number of input lines flagged as bad so far.
  nerrors = 0;
}
# Once an abort has been requested, stop consuming input immediately.
{
  if (abort >= 0) { exit abort; }
}

# Blank line (empty, or spaces only): nothing to validate.
/^ *$/ { next; }

# "##"-comment: a page/unit header.  Strip the comment marker and
# validate what remains as a page locator line.
/^##/ {
  sub(/^## */, "", $0);
  if (check_page_header($0) == 0)
    { print_line(); nerrors++; }
  next;
}

# Any other "#"-comment is ignored.
/^#/ { next; }

# VTT-style page/unit header: a "<...>" locator without a ";" part.
/^<[^;<>]*>/ {
  if (check_page_header($0) == 0)
    { print_line(); nerrors++; }
  next;
}

# Any other line starting with "<" is taken to be a text line.
/^</ {
  if (check_text_line($0) == 0)
    { print_line(); nerrors++; }
  next;
}

# Whatever reaches this point fits none of the known line formats.
/./ {
  format_error("bad line format");
  print_line();
  nerrors++;
  next;
}
# Final report: either note that the run was aborted (propagating the
# requested exit status), or print the count of flagged lines to stderr.
END {
  if (abort >= 0)
    {
      printf "aborted\n" > "/dev/stderr";
      exit abort;
    }
  printf "\n%d errors flagged\n", nerrors > "/dev/stderr";
}
# Validates one text line "lin", which must look like
#
#   <PAGE.UNIT.LINE;T>   TEXT
#
# with the locator padded with blanks through column 19 and the text
# starting in column 20.  Reports each problem through format_error()
# and returns 1 if the line is acceptable, 0 otherwise.
#
# Reads the globals "chars", "checkLineLengths", "checkTerminators",
# "requirePageHeaders", "requireUnitHeaders"; reads and updates
# "fnum", "unit", "old_fn", "old_un", "old_ln", "old_nc", "old_cr"
# and the array "tr_seen".
#
# Only "lin" is passed by callers; the remaining parameters are locals.
#
# Fixes relative to the previous version:
#  - the location code was re-extracted with substr(loc, i, ...) where
#    "i" was never set, which cut off the last character of the line
#    number; the start index is now 1;
#  - a local was used first as a scalar and then as the array argument
#    of split(), which gawk rejects; separate locals are used now;
#  - the blank-padding check compared against a fixed one-blank string;
#    the expected padding is now built to the required width;
#  - "ln", "nc" and "cr" are now locals instead of leaking as globals.
function check_text_line(lin,    txt,loc,len,fld,fn,un,ln,tr,nf,res,npad,nc,cr)
{
  res = 1;
  if (length(lin) <= 19)
    { format_error("missing text"); res = 0; }

  # Check general format, and extract location code and text proper.
  # Note that line number must start with digit,
  # while the unit code must start with letter:
  match(lin, /^<f[0-9]+[vr]?[0-9]?[.][A-Za-z][A-Za-z0-9]*[.][0-9]+[a-e]?;[A-Z]>/);
  if (RSTART != 1)
    { format_error("bad location format"); return 0; }
  loc = substr(lin, RSTART+1, RLENGTH-2);
  txt = substr(lin, RLENGTH+1);

  # Everything between the locator and column 19 must be blank.
  # (No padding is expected when the locator already reaches column 19.)
  npad = 19 - RLENGTH;
  if ((npad > 0) && (substr(lin, RLENGTH+1, npad) != sprintf(("%" npad "s"), "")))
    { format_error("too few blanks"); res = 0; }
  # ... and the text itself must begin in column 20.
  if (substr(lin, 20, 1) == " ")
    { format_error("too many blanks"); res = 0; }
  gsub(/^[ ]+/, "", txt);
  gsub(/[ ]+$/, "", txt);

  # Validate location code

  # Split location into fields.  The regex above guarantees the shape
  # "PAGE.UNIT.LINE;T", so a mismatch here means a bug in this script.
  len = length(loc);
  tr = substr(loc, len, 1);
  if (substr(loc, len-1, 1) != ";") { fatal_error("program error"); }
  loc = substr(loc, 1, len-2);
  nf = split(loc, fld, /[.]/);
  if (nf != 3) { fatal_error("program error"); }
  fn = fld[1]; un = fld[2]; ln = fld[3];

  # Check page f-number:
  if (requirePageHeaders)
    { if (fnum == "")
        { format_error("missing page header line"); res = 0;
          # Adopt this page so the complaint is issued only once.
          fnum = fn;
        }
      else
        { if (fn != fnum)
            { format_error(("wrong page f-number, should have been " fnum)); res = 0; }
        }
    }
  else
    { fnum = fn; }

  if (requireUnitHeaders)
    { # Check unit tag:
      if (unit == "")
        { format_error("missing unit header line"); res = 0;
          # Adopt this unit so the complaint is issued only once.
          unit = un;
        }
      else
        { if (un != unit)
            { format_error(("wrong unit code, should have been " unit)); res = 0; }
        }
    }
  else
    { unit = un; }

  # Convert line numbers to pure number
  if (match(ln, /[0-9]$/))
    { # Append a "0"
      ln = (ln "0");
    }
  else
    { # Convert the final letter to a digit:
      gsub(/[a]$/, "1", ln);
      gsub(/[b]$/, "2", ln);
      gsub(/[c]$/, "3", ln);
      gsub(/[d]$/, "4", ln);
      gsub(/[e]$/, "5", ln);
    }
  if (! match(ln, /^[0-9][0-9]*$/)) { fatal_error("program error"); }

  # printf "[%s][%s][%s] -> [%s][%s][%s][%s]", \
  #   old_fn, old_un, old_ln, fn, un, ln, tr > "/dev/stderr";

  # Check for non-decreasing line numbers:
  if ((fn == old_fn) && (un == old_un))
    { if ((ln + 0) < (old_ln + 0))
        { format_error("lines out of order"); res = 0; }
    }

  # Check for repeated transcription code:
  if ((fn == old_fn) && (un == old_un) && (ln == old_ln))
    { if (tr in tr_seen)
        { format_error("repeated transcription code"); res = 0; }
    }
  else
    { # New location: forget the transcription codes seen so far.
      split("", tr_seen);
    }
  tr_seen[tr] = 1;

  # Validate line length
  if (checkLineLengths)
    {
      # Remove trailing comments and fillers, if any
      while (gsub(/[{][^{}]*[}]$/, "", txt)) { }
      nc = length(txt);
      if ((fn == old_fn) && (un == old_un) && (ln == old_ln))
        { if ((old_nc != -1) && (nc != old_nc))
            { format_error(("inconsistent line lengths (" old_nc ":" nc ")"));
              res = 0;
            }
        }
      old_nc = nc;
    }

  # Validate text proper

  # Remove '{}' comments
  gsub(/[{][^{}]*[}]/, "", txt);

  # Ignore trailing blanks:
  gsub(/ *$/, "", txt);

  # Assume the '[|]' groups have been unfolded,
  # otherwise we should do this:
  # gsub(/\[[-*%A-Z.24678]*[|][-*%A-Z.24678]*\]/, "", txt);

  # Remove non-significant fillers [!] but leave skip-markers [%]:
  gsub(/[!]/, "", txt);

  # Empty lines are OK; only non-empty text is checked further.
  if (txt != "")
    {
      # Replace "&NNN;" weirdo codes by the "*" placeholder
      gsub(/[&][0-9][0-9][0-9];/, "*", txt);

      # Check for leading or double word breaks
      # (one trailing word break is OK, e.g. in circular text.)
      if (txt ~ /^[-.,]./)
        { format_error("leading [-.,]"); res = 0; }
      # All but the last character must be allowed characters or breaks;
      # the last character is checked as a terminator below.
      if (txt !~ ("^[-.,*" chars "]*.$"))
        { format_error("invalid char in text"); res = 0; }
      if (txt ~ /[-.,][-.,]/)
        { format_error("doubled [-.,]"); res = 0; }
      if (txt !~ /[-=.,]$/)
        { format_error("text should end with [-=.,]"); res = 0; }
      else
        { cr = substr(txt, length(txt), 1);
          if ((fn == old_fn) && (un == old_un) && (ln == old_ln))
            { if ((old_cr != "") && (cr != old_cr) && (checkTerminators))
                { format_error(("inconsistent line terminator (" old_cr ":" cr ")"));
                  res = 0;
                }
            }
          old_cr = cr;
        }
    }

  # Remember this line's location for the checks on the next line.
  old_fn = fn; old_un = un; old_ln = ln;
  return res;
}
# Validates a page/unit header line "lin" of the form
#
#   <PAGE> {ATTRS}     or     <PAGE.UNIT> {ATTRS}
#
# Reports each problem through format_error() and returns 1 if the
# line is acceptable, 0 otherwise.  On a well-formed locator line it
# sets the globals "fnum" (page f-number) and "unit" (unit code, or ""
# when the locator has none), which are later used to check text lines.
#
# Only "lin" is passed by callers; the remaining parameters are locals.
#
# Fix relative to the previous version: the check against the
# user-supplied "location" compared two variables that were never
# assigned, so it could never fail.  The header is now compared with
# "location" itself, and the message has a blank before the expected
# value.
# NOTE(review): the intended matching rule is not spelled out in this
# file; here the page part must always match, and the unit part is
# compared only when both "location" and the header carry one. Confirm.
function check_page_header(lin,    loc,att,fld,n,i,res,req_fnum,req_unit)
{
  res = 1;
  if (! match(lin, /^<[^<>{}]*> *[{][^{}<>]*[}] *$/))
    { format_error("bad page locator line"); return 0; }

  # Extract location code (the overall format check above guarantees
  # that a "<...>" group is present).
  if (! match(lin, /<.*>/)) { fatal_error("program error"); }
  loc = substr(lin, RSTART+1, RLENGTH-2);

  # decompose location code in "fnum" and "unit"
  if (match(loc, /[.][A-Za-z0-9]+$/))
    { unit = substr(loc, RSTART+1, RLENGTH-1);
      fnum = substr(loc, 1, RSTART-1);
    }
  else
    { unit = ""; fnum = loc; }

  # Check page f-number, save in "fnum":
  if (! match(fnum, /^f[0-9]+[vr]?[0-9]?$/))
    { format_error("bad f-number in ##-header"); res = 0; }

  # Check the header against the required location, if one was given.
  if (location != "")
    { if (match(location, /[.][A-Za-z0-9]+$/))
        { req_unit = substr(location, RSTART+1, RLENGTH-1);
          req_fnum = substr(location, 1, RSTART-1);
        }
      else
        { req_unit = ""; req_fnum = location; }
      if (fnum != req_fnum)
        { format_error(("wrong page f-number, should have been " req_fnum)); res = 0; }
      else if ((unit != "") && (req_unit != "") && (unit != req_unit))
        { format_error(("wrong unit code, should have been " req_unit)); res = 0; }
    }

  # Check attribute list: blank-separated items of the form "$X=Y",
  # with X an uppercase letter and Y an uppercase letter or digit.
  match(lin, /[{].*[}]/);
  if (RSTART == 0) { fatal_error("program error"); }
  att = substr(lin, RSTART+1, RLENGTH-2);
  n = split(att, fld, " ");
  for (i = 1; i <= n; i++)
    { if (! match(fld[i], /^ *[$][A-Z][=][A-Z0-9] *$/))
        { format_error("bad page attribute"); res = 0; }
    }
  return res;
}
# Reports a command-line argument problem "msg" on stderr, followed by
# the usage synopsis, then terminates the script with exit status 1.
function arg_error(msg)
{
  printf "*** %s\nusage: %s\n", msg, usage > "/dev/stderr";
  abort = 1;
  exit abort;
}
# Reports an unrecoverable problem "msg" on stderr, tagged with the
# current input file name and line number, then terminates the script
# with exit status 1.
function fatal_error(msg)
{
  printf("file %s, line %d: *** %s\n", FILENAME, FNR, msg) > "/dev/stderr";
  abort = 1;
  exit abort;
}
# Reports a format problem "msg" on stderr, tagged with the current
# input file name and line number.  Processing continues afterwards.
function format_error(msg)
{
  printf("file %s, line %d: %s\n", FILENAME, FNR, msg) > "/dev/stderr";
}
# Echoes the current input record on stderr, tagged with the current
# input file name and line number, and follows it with an empty line.
function print_line()
{
  printf "file %s, line %d: %s\n\n", FILENAME, FNR, $0 > "/dev/stderr";
}