#! /usr/bin/gawk -f
# Last edited on 2025-05-04 15:44:11 by stolfi

# Reads a transcription file whose data lines have the form
# "<LOCATOR> TEXT" and writes the tokens (words) of TEXT, one per line.
#
# In TEXT, the characters "-", "/", "=", ".", "," and " " are token
# delimiters; "{...}" groups are embedded comments and "!" is a filler
# (both are deleted); "?", "*" and "%" are all mapped to "?".
# Lines that are blank or start with "#", and lines whose locator
# contains no ";", are ignored.

BEGIN {
  abort = -1;
  usage = ( "tokens-from-evt \\\n" \
    " [ -v showLines=BOOL ] [ -v showParags=BOOL ] \\\n" \
    " [ -v omitInitial=BOOL ] [ -v omitFinal=BOOL ] \\\n" \
    " [ -v omitMedial=BOOL ] \\\n" \
    " [ -v showLocation=BOOL ] \\\n" \
    " < INFILE > OUTFILE" \
  );

  # By default, input line breaks and paragraph breaks are treated
  # just like any other word space.

  # If {showLines} is true, each input line break is translated into a
  # blank line, and each input paragraph break into two blank lines.

  # If {showLines} is false but {showParags} is true, input line breaks
  # are treated like word spaces, but paragraph breaks are translated
  # into one blank line.

  # If {omitInitial} is true, tokens immediately following a line or
  # figure break are omitted from the output. The option {omitFinal} is
  # symmetric (tokens immediately preceding a break), and {omitMedial}
  # discards any tokens that are *not* adjacent to a line or figure break.

  # If {showLocation} is true, prints the locator of the token's own
  # input line before every token (but not on the blank lines).

  if (showLines == "") { showLines = 0; }
  if (showParags == "") { showParags = 0; }

  # It simplifies things if {showLines} implies {showParags}:
  if (showLines) { showParags = 1; }

  if (omitInitial == "") { omitInitial = 0; }
  if (omitMedial == "") { omitMedial = 0; }
  if (omitFinal == "") { omitFinal = 0; }
  if (omitInitial && omitMedial && omitFinal) { arg_error("omitting everything!"); }

  if (showLocation == "") { showLocation = 0; }

  # We first reduce the input file to a stream of tokens alternating
  # with delimiter strings. This stream is fed through the filter
  # {output_token}, which converts the breaks to blank lines, as
  # requested, and omits line-initial, -medial, or -final tokens,
  # as requested.

  leftover = "=";     # Leftover delimiters from the previous line.
  curToken = "";      # Last token parsed but not yet printed, or "" if none.
  curLoc = "";        # Locator of the input line that {curToken} came from.
  curInitial = 0;     # TRUE if {curToken} immediately follows a break.
  nBlanks = 2;        # Blank lines printed since the last non-blank line
                      # (starts at 2 so that no leading blanks are printed).
  nBlanksNeeded = 0;  # Blank lines needed before the next non-blank line.
}

# Stop reading input as soon as an error has been flagged:
(abort >= 0) { exit abort; }

# Ignore blank lines and "#" comment lines:
/^[ ]*([#]|$)/ { next; }

# Ignore lines whose locator has no ";" part:
/^<[^<>;]*>/ { next; }

# Any other line is a data line:
// {
  lin = $0;

  # Extract line locator:
  if (match(lin, /^<[^>]*>/)) {
    loc = substr(lin, RSTART+1, RLENGTH-2);
  } else {
    loc = "f0r.P0.0;X";
  }

  # Remove line locator:
  gsub(/^<[^>]*> */, "", lin);

  # Remove embedded comments (twice in case of nested "{}"s):
  gsub(/{[^{}]*}/, "", lin);
  gsub(/{[^{}]*}/, "", lin);

  # Remove fillers:
  gsub(/[!]/, "", lin);

  # Reduce all bad char codes to "?":
  gsub(/[?*%]/, "?", lin);

  if (lin == "") { next; }

  # Prepend leftover delimiters from the previous line:
  lin = ( leftover lin );

  while (match(lin, /^[-\/=., ]+[^-\/=., ]+/)) {
    # Isolate the next token with its preceding delimiters:
    w = substr(lin, 1, RLENGTH);
    lin = substr(lin, RLENGTH + 1);

    # Split them apart:
    if (! match(w, /^[-\/=., ]+/)) { prog_error(("missing delimiter")); }
    delim = substr(w, 1, RLENGTH);
    w = substr(w, RLENGTH + 1);

    # Output delimiter and token:
    output_token(delim, w);
  }

  # Only trailing delimiters may remain at this point:
  if (lin !~ /^[-\/=., ]*$/) { prog_error(("left food on plate")); }

  # Newlines in file are implicit word breaks:
  leftover = ( lin ".");
  next;
}

function flush_blanks()
{
  # Prints blank lines until {nBlanks} reaches {nBlanksNeeded}.
  while (nBlanks < nBlanksNeeded) { print ""; nBlanks++; }
}

function output_token(delim,w,   isBreak,isParBreak,curFinal,omit,nB)
{
  # Processes the delimiter string {delim} and the token {w} that follows it.
  #
  # Output is delayed by one token: this call decides the fate of the
  # previously saved {curToken} (because only now is it known whether a
  # break follows it), and then saves {w} as the new {curToken}.
  #
  # Reads the global {loc} (locator of the line that {w} came from) and
  # the option variables; updates {curToken}, {curLoc}, {curInitial},
  # {nBlanks}, and {nBlanksNeeded}.

  if (delim == "") { prog_error(("empty delim")); }
  if (w == "") { prog_error(("empty w")); }

  # Determine breaks between {curToken} and {w}.
  # Note that {isParBreak} implies {isBreak}.
  isBreak = (delim ~ /[-\/=]/);
  isParBreak = (delim ~ /[=]/);

  # Blank lines called for by the break between {curToken} and {w}:
  nB = (isBreak && showLines) + (isParBreak && showParags);

  # Output {curToken}, if appropriate:
  if (curToken != "") {
    curFinal = isBreak;
    omit = 0;
    omit += (omitInitial && curInitial);
    omit += (omitFinal && curFinal);
    omit += (omitMedial && (!curInitial) && (!curFinal));
    if (omit == 0) {
      # Blank lines pending since the last printed token go *before*
      # this one, so that breaks next to omitted tokens stay in place:
      flush_blanks();
      # Use the locator saved with the token; {loc} may already
      # refer to a later input line:
      if (showLocation) { printf "%s ", curLoc; }
      print curToken;
      nBlanks = 0;
      nBlanksNeeded = 0;
    }
  }

  # The break between {curToken} and {w} becomes pending only now, so
  # that it lands after {curToken} and before the next printed token:
  if (nBlanksNeeded < nB) { nBlanksNeeded = nB; }

  # Save {w} as the current token, with the locator of its line:
  curToken = w;
  curLoc = loc;
  curInitial = isBreak;
}

END {
  if (abort >= 0) { exit abort; }
  # Flush out the last token, treating end-of-file as a paragraph break:
  output_token("=", "EOF");
  # Print the blank lines that terminate the last line/paragraph:
  flush_blanks();
}

function arg_error(msg)
{
  # Reports a command line error {msg} plus the usage text, and aborts.
  printf "%s\n", msg >> "/dev/stderr";
  printf "usage: %s\n", usage >> "/dev/stderr";
  abort = 1;
  exit 1;
}

function data_error(msg)
{
  # Reports an error {msg} in the current input line, and aborts.
  printf "line %d: %s\n", FNR, msg >> "/dev/stderr";
  abort = 1;
  exit 1;
}

function prog_error(msg)
{
  # Reports an internal inconsistency {msg}, and aborts.
  printf "line %d: prog error - %s\n", FNR, msg >> "/dev/stderr";
  abort = 1;
  exit 1;
}