#! /usr/bin/gawk -f
# Last edited on 2025-07-01 15:09:09 by stolfi

# Reads from {stdin} a file in a format similar to that of the 1.6e6 EVT
# interlinear. Writes the same to {stdout},
# with the lines of each parag joined in a single line of the
# file.

# On input, each data line must have the format "{ILOC} {TEXT}" where
# {ILOC} is a locator and {TEXT} is one line of VMS text.
#
# The locator must be "<{PAGE}.{LNUM}>", where {PAGE} is a page's
# f-number (like "f103r" or "f111v"), {UNIT} is one capital letter
# followed by one or more digits, and {LNUM} is a non-negative line
# number in the page. The {LNUM}s of each page must be consecutive
# integers starting with 01.
# ?? Use <%> <$>
# ??? Handle inline comments
#
# The {TEXT} must be a string that starts
# with [=»], ends with [=«], and contains EVA text. The latter should
# consist of words over the alphabet [a-z*] with separators '.'
# (word space), or ',' (possible word space).
#
# The initial separator of {TEXT} should be '=' for the head of a parag
# that is left-justified (starts at or before the left rail), '»' for
# the head of a parag that is indented (starts to the right of the left
# rail), and '-' for any other line.
#
# The final separator of {TEXT} should be '=' for the tail of a parag
# that is a long line (ends on or beyond the right rail), '«' for the
# tail of a parag that is a short line (ends before the right rail),
# and '-' for any other line.
#
# Thus each parag in the input would be one or more consecutive lines
# such that the first one starts with [»=], the last one ends with [=«],
# and all other initial and final delims are '-'.

# The output will have lines in similar format "{OLOC} {PARAG}" where
# {OLOC} is "<{PAGE}.{LNUM}>" and {PARAG} is the text of a paragraph.
# Each parag in the input has its lines joined, with a single '-'
# to indicate each join. The {LNUM} in {OLOC} is taken from
# the {ILOC} of the head line.

# The input file may contain blank lines and '#'-comments.
# These are all reproduced in the output file without change.
#
# NOTE(review): {data_error} and {prog_error} are called below but are
# not defined in this file; presumably they are supplied by a companion
# library loaded with an extra "-f" option, and they set {abort} before
# exiting -- confirm against the invoking script.
#
# NOTE(review): the bracket expressions containing '»' and '«' treat
# them as single characters only when gawk runs in a multibyte (e.g.
# UTF-8) locale -- confirm the locale used by callers.

BEGIN {
  abort = -1;    # Exit status to use once it becomes non-negative.
  opage = "";    # Page of the previous data line ("" before the first one).
  olnum = 0;     # Line number of the previous data line within {opage}.
  nlins_in = 0;  # Number of data lines in current input page.
  nlins_ot = 0;  # Number of data lines in current output page.
  parag = "";    # Text of current parag so far.
}

# Stop processing records as soon as an abort status has been set.
(abort >= 0) { exit(abort); }

# Remove trailing blanks:
// { gsub(/[ \011]+$/, "", $0); }

# Blank lines are copied to the output as empty lines.
/^$/ { print ""; next; }

# Comment lines are copied to the output unchanged.
/^[#]/ { print; next; }

# Data line "<{PAGE}.{LNUM}> {TEXT}": validate it and append {TEXT} to
# the parag being assembled, writing the parag out when it is complete.
/^<[^<>]*>[ ]+/ {
  if (NF != 2) { prog_error(("unexpected {NF}")); }
  loc = $1;
  text = $2;
  if (! match(text, /^[-=»][.,a-z*]+[-=«]$/)) { data_error(("invalid text")); }

  # Strip the angle brackets and split the locator at its dots.  Note
  # that {patsplit} stores the matched dots in {seps} and the pieces
  # between them in {locfs}, the latter indexed from 0.
  gsub(/^[<]/, "", loc);
  gsub(/[>]$/, "", loc);
  nsf = patsplit(loc, seps, /[.]/, locfs);
  # printf "%s nsf = %d\n", loc, nsf > "/dev/stderr"
  if (nsf != 1) { data_error(("invalid locator <" loc "> nsf = " nsf "")); }
  page = locfs[0];
  lnum_str = locfs[1];

  if (! match(page, /^f[1-9][0-9]*[rv][0-9]*$/)) {
    data_error(("bad page number \"" page "\""));
  }
  # Validate the line number as a string BEFORE converting it: after
  # "+ 0" any junk would already have been coerced to a number and the
  # check could never fail.
  if (! match(lnum_str, /^[0-9]+$/)) {
    data_error(("bad line number \"" lnum_str "\""));
  }
  lnum = lnum_str + 0;

  if (page != opage) { new_page(); }

  # Line numbers within a page must be 1, 2, 3, ...
  olnum++;
  if (lnum > olnum) {
    data_error(("skipped line numbers " olnum ".." lnum-1));
  } else if (lnum < olnum) {
    data_error(("repeated or out-of-order line number " lnum ", expected " olnum));
  }
  nlins_in++;

  if (substr(text, 1, 1) == "-") {
    # Continuation line: drop the trailing '-' of the parag so far, so
    # that the leading '-' of {text} is the single join marker.
    if (parag == "") { data_error(("no parag to continue")); }
    if (substr(parag, length(parag), 1) != "-") { prog_error(("parag end")); }
    parag = (substr(parag, 1, length(parag) - 1) text);
  } else {
    # Start of new parag; remember the line number of its head line.
    if (parag != "") { data_error(("unterminated parag")); }
    parag = text;
    lnum_parag = lnum;
  }

  if (substr(parag, length(parag), 1) != "-") {
    # Parag complete:
    finished_parag();
  }
  next;
}

# Anything that reaches this rule matched none of the formats above.
// { data_error(("bad line format")); }

END {
  if (abort >= 0) { exit(abort); }
  # Close the last page (checks for a dangling parag, prints its stats).
  new_page();
}

# Finishes the page {opage}, if any, and starts the page {page}.
# Reports an error if the old page ended in the middle of a parag,
# prints the old page's line and parag counts to stderr, and resets
# the per-page state.
function new_page() {
  if (opage != "") {
    if (substr(parag, length(parag), 1) == "-") {
      data_error(("page ended with incomplete parag"));
    }
    printf "page %s - %d lines in, %d parags out\n", opage, nlins_in, nlins_ot > "/dev/stderr";
  }
  opage = page;
  olnum = 0;
  nlins_in = 0;
  nlins_ot = 0;
  parag = "";
}

# Writes the completed parag {parag} to stdout, preceded by the locator
# "<{page}.{lnum_parag}>" left-justified in a 19-character column, then
# counts it and clears {parag}.
function finished_parag() {
  wloc = sprintf("<%s.%02d>", page, lnum_parag);
  printf "%-19s%s\n", wloc, parag;
  nlins_ot++;
  parag = "";
}