#! /usr/bin/gawk -f
# Last edited on 2025-07-01 15:09:00 by stolfi

# Reads from {stdin} a file in a format similar to that of the 1.6e6 EVT
# interlinear, where the lines of each parag are joined in a single line
# of the file. Writes to {stdout} the same data with one text line per
# line of the file.
#
# On input, each parag must have the format "{ILOC} {PARAG}" where
# {ILOC} is a locator and {PARAG} is the text of the paragraph. The
# locator must be "<{PAGE}.{UNIT}.{LNUM}>" or "<{PAGE}.{LNUM}>", {PAGE}
# is a page's f-number (like "f103r" or "f111v"), {UNIT} is one capital
# letter followed by one or more digits, and {LNUM} is a non-negative
# line number in the page. The {PARAG} must be a string that starts
# with [=»], ends with [=«], and contains EVA text. The latter should
# consist of words over the alphabet [a-z*] with separators '-' (line
# break), '.' (word space), or ',' (possible word space).
#
# ??? Generate <%> <$>
# ??? Handle inline comments
#
# The output will have lines in similar format "{OLOC} {TLINE}" where
# {OLOC} is "<{PAGE}.{LNUM}>" and {TLINE} is a text line. Each input
# {PARAG} string is split at the '-' delimiters, with a '-' at end of the
# left part and another '-' at the beginning of the right part.
# Otherwise the words and delimiters of the input are preserved. For
# each input parag, the first output line has the input {PAGE} and
# {LNUM}, and each successive line has the {LNUM} incremented by 1. The
# script aborts if this results in two output lines with the same
# {LNUM}.
#
# The input file may contain blank lines and '#'-comments. These are all
# reproduced in the output file without change.
#
# The functions {data_error} and {prog_error} are called below but are
# not defined in this part of the file; they are assumed to be provided
# elsewhere.

BEGIN {
  abort = -1;
  opage = "??";   # Page of the last output line; "??" means "none yet".
  olnum = -1;     # Line number of the last output line in page {opage}, or -1.
  nlins_in = 0;   # Number of input parags in current page.
  nlins_ot = 0;   # Number of data lines in current output page.
  # NOTE(review): the purpose of this side file is not evident from
  # this script -- confirm that it is still needed.
  printf "«×»\n" > ".mts";
}

# Stop processing as soon as an error has been flagged:
(abort >= 0) { exit(abort); }

# Remove trailing blanks:
{ gsub(/[ \011]+$/, "", $0); }

# Blank lines and '#'-comments are copied through:
/^$/ { print ""; next; }

/^[#]/ { print; next; }

# Data line "{ILOC} {PARAG}":
/^<[^<>]*>[ ]+/ {
  if (NF != 2) { prog_error(("unexpected {NF}")); }
  loc = $1;
  parag = $2;
  if (! match(parag, /^[=»][-.,a-z*]+[=«]$/)) { data_error(("invalid parag text")); }

  # Strip the angle brackets and split the locator at the dots.
  # Note that {patsplit} puts the matched dots in {seps} and the
  # pieces between them in {locfs[0..nsf]}, where {nsf} is the
  # number of dots found.
  gsub(/^[<]/, "", loc);
  gsub(/[>]$/, "", loc);
  nsf = patsplit(loc, seps, /[.]/, locfs);
  # printf "%s nsf = %d\n", loc, nsf > "/dev/stderr"
  if (nsf == 2) {
    page = locfs[0]; unit = locfs[1]; lnum = locfs[2];
  } else if (nsf == 1) {
    page = locfs[0]; unit = "P1"; lnum = locfs[1];
  } else {
    data_error(("invalid locator <" loc ">"));
  }

  if (! match(page, /^f[1-9][0-9]*[rv][0-9]*$/)) {
    data_error(("bad page number \"" page "\""));
  }
  if (! match(unit, /^[A-Z][1-9][0-9]*$/)) {
    data_error(("bad text unit number \"" unit "\""));
  }
  if (! match(lnum, /^[0-9]+$/)) {
    data_error(("bad line number \"" lnum "\""));
  }
  # Force numeric comparisons and arithmetic on the line number:
  lnum = lnum + 0;

  # Split the parag text at the '-' delimiters; the pieces between
  # them go in {tlins[0..nsp]}, where {nsp} is the number of '-'s.
  nsp = patsplit(parag, seps, /[-]/, tlins);

  if (page != opage) { new_page(); }

  # Warn about a gap since the last line written for this page:
  if ((olnum >= 0) && (lnum > olnum + 1)) {
    printf "!! skipped line numbers %d..%d\n", olnum+1, lnum-1 > "/dev/stderr";
  }

  nlins_in++;
  for (i = 0; i <= nsp; i++) {
    if (lnum <= olnum) {
      data_error(("repeated line number " olnum " " lnum ""));
    }
    wloc = sprintf("<%s.%02d>", page, lnum);
    lsep = (i == 0 ? "" : "-");
    rsep = (i == nsp ? "" : "-");
    printf "%-19s%s%s%s\n", wloc, lsep, tlins[i], rsep;
    nlins_ot++;
    # Remember the last line number written, for the checks above:
    olnum = lnum;
    lnum++;
  }
  next;
}

# Anything else is an error:
{ data_error(("bad line format")); }

END {
  if (abort >= 0) { exit abort; }
  # Report the statistics of the last page:
  new_page();
}

function new_page() {
  # Reports to {stderr} the parag and line counts of the page {opage}
  # that just ended (if any), then resets the per-page state to start
  # the page named by the global {page}.
  if (opage != "??") {
    printf "page %s - %d parags in, %d lines out\n", opage, nlins_in, nlins_ot > "/dev/stderr";
  }
  opage = page;
  olnum = -1;
  nlins_in = 0;
  nlins_ot = 0;
}