#! /usr/bin/gawk -f
# Last edited on 2025-12-08 14:47:58 by stolfi

# Validates the format of the 25e1 interlinear file.
# Flags errors to {stderr} and writes the offending lines
# to {stdout}.
#
# Assumes that there are no isolated plumes (apostrophes or '~')
# Assumes that there are no synch fillers, '!' or '%'.
# Assumes that weirdos are represented as numeric codes "&{NNN};"
# Ignores inline comments "<!...>"
# Ignores or requires page headers
# Ignores blank lines, and #-comments.
# Unacceptable items:
#   Stolfi's temp markers "<[|:]>" anywhere except at start/end of line.
#   Parag markers "<[%$]>" anywhere except at start/end of line.
#   Unfolded alternatives ('[]' groups).
#   Nested or unpaired braces.
#   Nested or unpaired parens.
#   Leading or duplicated word-separators [-,.].
#   Alignment markers [«=»] except at start/end of line.
#   Malformed weirdo codes '&NNN;'.
#
# The {locField} must be less than {dataField}
# and the data field must be last in the line.
#
# Global state shared by the rules and functions below:
#   abort               exit status once a fatal/argument error occurred, else -1.
#   old_fnum, old_lseq  page f-number and line sequence number of the
#                       previous header/data line ("" when unknown).
#   nerrors             number of offending lines written to {stdout}.

BEGIN {
  abort = -1;

  usage = ( \
    "validate-new-evt-format \\\n" \
    " [ -v locField=NUM ] \\\n" \
    " [ -v dataField=NUM ] \\\n" \
    " [ -v validChars=CHARS ] \\\n" \
    " [ -v checkLineDelims=BOOL ] \\\n" \
    " [ -v checkLineOrder=BOOL ] \\\n" \
    " [ -v requirePageHeaders=BOOL ] \\\n" \
    " [ -v requireTranscriber=BOOL ] \\\n" \
    " [ -v allowAlignmentMarks=BOOL ] \\\n" \
    " [ -v allowParagMarks=BOOL ] \\\n" \
    " INFILE " \
  );

  # where CHARS are the allowed non-space letters, and LOC is a
  # location code without line number e.g. "f103v2.T1" or "f83r"
  # NOTE(review): no LOC option appears in the usage text above;
  # this sentence may be left over from an older version.

  # Defaults for options not given with "-v":
  if (validChars == "") { validChars = "abcdefghijklmnopqrstuvxyz"; }
  # Characters with a structural meaning may not be listed as letters:
  if (validChars ~ /[-=«»,.{}()*!%?~'&;<>]/) {
    arg_error("invalid characters in \"validChars\" list\n");
  }
  if (locField == "") { locField = 1; }
  if (dataField == "") { dataField = 2; }
  # NOTE(review): {checkLineDelims} is given a default but is not
  # consulted anywhere in this file.
  if (checkLineDelims == "") { checkLineDelims = 1; }
  if (checkLineOrder == "") { checkLineOrder = 1; }
  if (requirePageHeaders == "") { requirePageHeaders = 1; }
  if (requireTranscriber == "") { requireTranscriber = 1; }
  if (allowAlignmentMarks == "") { allowAlignmentMarks = 1; }
  if (allowParagMarks == "") { allowParagMarks = 1; }

  if (locField >= dataField) {
    arg_error(("bad locField = " locField " dataField = " dataField))
  }

  # Location fields of previous line:
  old_fnum = "";
  old_lseq = "";
  nerrors = 0;
}

# Stop processing records as soon as an abort has been requested:
// {
  if (abort >= 0) { exit abort; }
}

# Blank line or #-comment line:
/^ *([#]|$)/ { next; }

# Page header line:
/^[<]f[0-9]+[rv][0-9]*[>]/ {
  gsub(/[ ]/, "", $2);
  if (NF > 2) {
    flag_format_error("bad page header line");
    output_bad_line();
  } else {
    if (! check_header_line($1, $2)) { output_bad_line(); }
  }
  next;
}

# Remove inline comments from everywhere:
// { gsub(/[<][!][^<>]*[>]/, "", $0) }

# Data line: has enough fields and a "<...>" locator containing a '.':
((NF >= dataField) && match($(locField), /^[<][^<>]*[.][^<>]*[>]/)) {
  if (! check_data_line($(locField),$(dataField))) { output_bad_line(); }
  next
}

# Other lines (anything non-empty that was not recognized above):
/./ {
  flag_format_error("bad line format");
  output_bad_line();
  next
}

END {
  if (abort >= 0) {
    printf "aborted\n" > "/dev/stderr";
    exit abort;
  }
  printf "\n" > "/dev/stderr";
  printf "%d bad lines\n", nerrors > "/dev/stderr";
}

function check_data_line(loc, dat,    fnum, lseq, tr, res, badch) {
  # Checks one data line with locator {loc} and text {dat}.
  # Returns 1 if the line is acceptable, 0 otherwise; every problem
  # found is reported through {flag_format_error}.
  #
  # A malformed locator (or a missing/invalid transcriber code) makes
  # the function return 0 at once, leaving {old_fnum,old_lseq} untouched.
  # Otherwise all remaining checks are performed and, on exit,
  # {old_fnum,old_lseq} are set to this line's page and sequence number.
  #
  # {fnum,lseq,tr,res,badch} are local work variables.

  res = 1;
  if (! match(loc, /^[<]f[0-9]+[vr]?[0-9]?[.][0-9]+([;][A-Z]|)[>]/)) {
    flag_format_error("malformed locator");
    return 0;
  }

  # Split {loc} into {fnum,lseq,tr} and validate:

  # Page f-number: text between the '<' and the first '.':
  fnum = loc;
  gsub(/^[<]/, "", fnum);
  gsub(/[.].*$/, "", fnum)
  if (! match(fnum, /^f[0-9]+[rv][0-9]*$/)) {
    flag_format_error(("invalid page f-number '" fnum "'"));
    return 0;
  }

  # Line sequence number: the digits right after the first '.':
  lseq = loc;
  gsub(/^[<][^.]*[.]/, "", lseq);
  gsub(/[^0-9].*$/, "", lseq)
  if (! match(lseq, /^[0-9]+$/)) {
    flag_format_error(("invalid line sequence number '" lseq "'"));
    return 0;
  }

  # Transcriber code: text between the ';' (if any) and the final '>':
  tr = loc;
  gsub(/^[^;]*([;]|$)/, "", tr);
  gsub(/[>]$/, "", tr)
  if (tr == "") {
    if (requireTranscriber) {
      flag_format_error(("missing transcriber code in '" loc "'"));
      return 0;
    }
  } else if (! match(tr, /^[A-Z]$/)) {
    flag_format_error(("invalid transcriber code '" tr "'"));
    return 0;
  }

  # Check for strictly increasing line numbers within a page:
  if (checkLineOrder && (fnum == old_fnum) && (old_lseq != "")) {
    if ((lseq + 0) <= (old_lseq + 0)) {
      flag_format_error(("line numbers not increasing = " old_lseq " " lseq));
      res = 0;
    }
  }

  # Check page f-number against the last page header seen:
  if (requirePageHeaders) {
    if (old_fnum == "") {
      flag_format_error("missing page header line");
      res = 0;
      old_fnum = fnum;
    } else {
      if (fnum != old_fnum) {
        flag_format_error(("wrong page f-number '" fnum "', should have been " old_fnum));
        res = 0;
      }
    }
  } else {
    old_fnum = fnum;
  }

  # Validate the data proper

  # Assume the '[|]' groups have been unfolded,
  # otherwise we should do this:
  # gsub(/\[[-*%A-Z.24678]*[|][-*%A-Z.24678]*\]/, "", dat);

  if (dat == "") {
    # Should not happen, but just in case:
    flag_format_error("line has no data");
    res = 0;
  } else {
    # Replace each well-formed weirdo code by a single '*':
    # NOTE(review): '*' is not in the accepted class tested below, so a
    # weirdo anywhere but in the last position is reported as an invalid
    # char -- confirm whether that is intended.
    gsub(/[&][0-9][0-9][0-9];/, "*", dat);

    if (allowAlignmentMarks) {
      # Remove alignment markers at either end, inside or outside
      # the parag markers:
      gsub(/^[<][%][>][«=»]/, "<%>", dat);
      gsub(/^[«=»]/, "", dat);
      gsub(/[«=»][<][$][>]$/, "<$>", dat);
      gsub(/[«=»]$/, "", dat);
    }

    if (allowParagMarks) {
      # Remove parag markers at either end:
      gsub(/^[<][%][>]/, "", dat);
      gsub(/[<][$][>]$/, "", dat);
    }

    # Check for leading or double word breaks
    # (one trailing word break is OK in circular text.)
    if (dat ~ /^[-.,]./) {
      flag_format_error("leading [-.,]");
      res = 0;
    }

    # All chars must be {validChars} or [-.,?]:
    # NOTE(review): the trailing '.' in the pattern lets ANY character
    # pass in the last position, and an empty {dat} (possible after the
    # removals above) is reported with an empty {badch} -- confirm intent.
    if (dat !~ ("^[-.,?" validChars "]*.$")) {
      # Locate the first character outside the accepted set:
      match(dat, ("^[-.,?" validChars "]*"));
      badch = substr(dat,RLENGTH+1,1);
      flag_format_error("invalid char \"" badch "\" in data field");
      res = 0;
    }

    # Check for consecutive word breaks:
    if (dat ~ /[-.,][-.,]/) {
      flag_format_error("doubled [-.,]");
      res = 0;
    }
  }

  old_fnum = fnum;
  old_lseq = lseq;
  return res;
}

function check_header_line(loc, dat,    fnum, res) {
  # Checks a page header line with locator {loc}; {dat} is not used.
  # Returns 1 if {loc} is a well-formed "<fNNN[rv]N>" header, else
  # reports the error and returns 0.
  # Sets the global variables {old_fnum,old_lseq} from it.

  res = 1;
  if (! match(loc, /^[<]f[0-9]+[rv][0-9]*[>]$/)) {
    flag_format_error("bad page header");
    return 0;
  }

  # Extract the page f-number by dropping the angle brackets:
  fnum = loc;
  gsub(/[<>]/, "", fnum)

  # A new page starts: forget the previous line number.
  old_fnum = fnum;
  old_lseq = ""
  return res;
}

function arg_error(msg) {
  # Reports a command-line argument error {msg} plus the usage text
  # to {stderr}, then requests termination with status 1.
  printf "*** %s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit abort;
}

function fatal_error(msg) {
  # Reports {msg} with the current file name and line number to
  # {stderr}, then requests termination with status 1.
  printf "%s:%d: *** %s\n", FILENAME, FNR, msg > "/dev/stderr";
  abort = 1;
  exit abort;
}

function flag_format_error(msg) {
  # Reports a non-fatal format problem {msg} for the current input
  # line to {stderr}.
  printf "%s:%d: %s\n", FILENAME, FNR, msg > "/dev/stderr";
}

function output_bad_line() {
  # Writes the current record to {stdout} and counts it as bad.
  print $0;
  nerrors++;
}