#! /usr/bin/gawk -f
# Last edited on 2025-07-01 15:09:00 by stolfi

# Reads from {stdin} a file in a format similar to that of the 1.6e6 EVT
# interlinear, where the lines of each parag are joined in a single line
# of the file. Writes to {stdout} the same data with one text line per
# line of the file.

# On input, each parag must have the format "{ILOC} {PARAG}" where
# {ILOC} is a locator and {PARAG} is the text of the paragraph. The
# locator must be "<{PAGE}.{UNIT}.{LNUM}>" or "<{PAGE}.{LNUM}>", {PAGE}
# is a page's f-number (like "f103r" or "f111v"), {UNIT} is one capital
# letter followed by one or more digits, and {LNUM} is a non-negative
# line number in the page. The {PARAG} must be a string that starts
# with [=ť], ends with [=Ť], and contains EVA text. The latter should
# consist of words over the alphabet [a-z*] with separators '-' (line
# break), '.' (word space), or ',' (possible word space).

# ??? Generate <%> <$>
# ??? Handle inline comments

# The output will have lines in similar format "{OLOC} {TLINE}" where
# {OLOC} is "<{PAGE}.{LNUM}>" and {TLINE} is a text line. Each input
# {PARAG} string is split at the '-' delimiters, with a '-' at end of the
# left part and another '-' at the beginning of the right part.
# Otherwise the words and delimiters of the input are preserved. For
# each input parag, the first output line has the input {PAGE} and
# {LNUM}, and each successive line has the {LNUM} incremented by 1. The
# script aborts if this results in two output lines with the same
# {LNUM}.

# The input file may contain blank lines and '#'-comments. These are all
# reproduced in the output file without change.

BEGIN {
  # Pending exit status; a negative value means that no abort was requested.
  abort = -1;

  # Per-page counters (also zeroed by {new_page}):
  nlins_ot = 0; # Number of data lines in current output page.
  nlins_in = 0; # Number of data lines in current input page.

  # Line number and page that incoming locators are compared against;
  # {-1} and "??" mean that no data line has been seen yet.
  olnum = -1;
  opage = "??";

  # NOTE(review): writes a three-character marker line to the file ".mts"
  # in the current directory; its purpose is not evident from this script.
  printf "Ť×ť\n" > ".mts";
}

# If an exit status has been recorded in {abort} (it is -1 initially;
# presumably set by the error routines), stop with that status:
(abort >= 0) {
  exit(abort);
}

# Remove trailing blanks and tabs from every record:
{
  sub(/[ \011]+$/, "");
}

# Records that are empty after that cleanup are echoed as empty lines:
/^$/ {
  print "";
  next;
}

# Records that start with '#' are copied to the output as they are:
/^[#]/ {
  print $0;
  next;
}

# Data line "<{PAGE}.{UNIT}.{LNUM}> {PARAG}" or "<{PAGE}.{LNUM}> {PARAG}".
# Validates the locator and the parag text, splits the parag at each '-',
# and writes one line "<{PAGE}.{LNUM}> {TLINE}" per piece, with {LNUM}
# incremented by 1 for each successive piece. Keeps {olnum} equal to the
# last line number written on the current page, so that gaps in the
# numbering are reported and repeated line numbers are rejected.
# NOTE(review): {prog_error} and {data_error} are not defined in this
# file; presumably they report the problem and terminate the script.
/^<[^<>]*>[ ]+/ {
  if (NF != 2) { prog_error(("unexpected {NF}")); }
  loc = $1;
  parag = $2;
  if (! match(parag, /^[=ť][-.,a-z*]+[=Ť]$/)) { data_error(("invalid parag text")); }
  gsub(/^[<]/, "", loc);  gsub(/[>]$/, "", loc);
  # {patsplit} stores the '.' matches in {seps} and the pieces between
  # them in {locfs[0..nsf]}; thus {nsf} is the number of '.'s in {loc}.
  nsf = patsplit(loc, seps, /[.]/, locfs);
  # printf "%s  nsf = %d\n", loc, nsf > "/dev/stderr"
  if (nsf == 2) {
    page = locfs[0];
    unit = locfs[1];
    lnum = locfs[2];
  } else if (nsf == 1) {
    # No {UNIT} in the locator; use a default one.
    page = locfs[0];
    unit = "P1";
    lnum = locfs[1];
  } else {
    data_error(("invalid locator <" loc ">"));
  }
  if (! match(page, /^f[1-9][0-9]*[rv][0-9]*$/)) { data_error(("bad page number \"" page "\"")); }
  if (! match(unit, /^[A-Z][1-9][0-9]*$/)) { data_error(("bad text unit number \"" unit "\"")); }
  if (! match(lnum, /^[0-9]+$/)) { data_error(("bad line number \"" lnum "\"")); }
  # Force {lnum} to be a number, so that the comparisons with {olnum}
  # below are numeric rather than string comparisons:
  lnum = lnum + 0;
  # As above: the text lines end up in {tlins[0..nsp]}, {nsp} is the number of '-'s.
  nsp = patsplit(parag, seps, /[-]/, tlins);
  if (page != opage) { new_page(); }

  # Warn about a gap between the last line written and this parag:
  if ((olnum >= 0) && (lnum > olnum + 1)) {
    printf "!! skipped line numbers %d..%d\n", olnum+1, lnum-1 > "/dev/stderr";
  }
  nlins_in++;
  for (i = 0; i <= nsp; i++) {
    if (lnum <= olnum) { data_error(("repeated line number " olnum " " lnum "")); }
    wloc = sprintf("<%s.%02d>", page, lnum);
    # Each piece gets a '-' on every side where the parag was split:
    lsep = (i == 0 ? "" : "-"); 
    rsep = (i == nsp ? "" : "-");
    printf "%-19s%s%s%s\n", wloc, lsep, tlins[i], rsep;
    nlins_ot++;
    # Remember the last line number written on this page:
    olnum = lnum;
    lnum++;
  }
  next;
}

# Any record not consumed by a rule above is malformed:
{
  data_error("bad line format");
}

END {
  # With no abort pending, let {new_page} report the counts of the last
  # page; otherwise just propagate the recorded exit status.
  if (abort < 0) {
    new_page();
  } else {
    exit abort;
  }
}

function new_page() {
  # Closes the current output page and starts a new one.
  #
  # Writes to {stderr} the counts {nlins_in} and {nlins_ot} of the page
  # {opage} that just ended, unless {opage} is "" or the "??" placeholder
  # that is set before any data line is read. Then makes the global
  # {page} the current page, resets {olnum} to {-1} (no line written on
  # the page yet) and zeroes both counters.
  if ((opage != "") && (opage != "??")) { 
    printf "page %s - %d parags in, %d lines out\n", opage, nlins_in, nlins_ot > "/dev/stderr";
  }
  opage = page; olnum = -1;
  nlins_in = 0; nlins_ot = 0;
}