#! /usr/bin/python3
# Last edited on 2026-02-10 06:53:19 by stolfi

import sys, os, re
from sys import stdout as out, stderr as err, stdin as inp
from process_funcs import bash
import ivtff_format as iv
from note_077_funcs import compute_and_print_stats
from math import floor, ceil

# Reads from {stdin} a file in a format similar to IVTFF. Writes the same
# to {stdout}, with the lines of each parag joined in a single line of
# the file.

# On input, each line must have the format "{ILOC} {TEXT}" where
# {ILOC} is a locus ID and {TEXT} is one line of VMS text. 
# 
# The locus ID must be "<{PAGE}.{LSEQ};{TRANS},{POSTY}>", where {PAGE}
# is a page's f-number (like "f103r" or "f111v"), {LSEQ} is a
# non-negative line number in the page, {TRANS} is a transcriber's code
# [A-Z]+, and {POSTY} is Rene's position-and-type code. The ";{TRANS}"
# and/or the ",{POSTY}" parts may be missing. The {LSEQ}s within each
# page must be consecutive integers starting with 1.
# 
# The {TEXT} must be a string that may contain any of these:
#
#   Prefix "<%>" marking the line as the head of a paragraph.
#
#   Prefix [«=»] to indicate the alignment of the start of the line
#     relative to the left rail.
#
#   EVA letters [A-Za-z].
#
#   Invalid EVA code '?'.
#
#   Weirdo codes "&{NNN};" (possibly without the ';') 
#     where {NNN} is three decimal digits.
#
#   Ligatures consisting of two or more EVA letters, '?', or
#     weirdo codes enclosed in braces "{...}".
#
#   Word separators [-.,].
#
#   Inline comments "<!...>", including various special comments
#     to indicate stars in the margin, wide linegaps, figure intrusions,
#     vellum folds, etc.
#
#   Suffix [«=»] to indicate the alignment of the end of the line
#     relative to the right rail.
#
#   Suffix "<$>" to mark the line as the tail of a parag.
#
# The input file may also contain blank lines, comment lines that start
# with '#', and page header lines that start with "<{PAGE}>".
#
# The {TEXT} must contain at least one EVA letter, '?', or weirdo code.
#
# The script copies to the output file all blank lines and comment
# lines. Page header lines are not copied.
#
# From data lines, it deletes all inline comments and alignment markers [«=»].
# 
# The script assumes that every parag has a "<%>" prefix on its first
# data line (the /head/) and a "<$>" suffix on its last data line (the
# /tail/). The script joins the {TEXT} fields of all the lines of each
# parag, cleaned as per above, into a single string {PTEXT}, with "-"
# separators and both the "<%>" prefix and "<$>" suffix. Then it writes a
# line "{PAGE} {LHEAD} {LTAIL} {PTEXT}" where {PAGE} is the parag's page,
# and {LHEAD} and {LTAIL} are the {LSEQ} fields of the first and last
# lines of the parag.
# 
# Data lines that are not part of a parag are omitted.

def main():
  # Reads an IVTFF-like file from {stdin} and writes to {stdout} one line
  # per parag, with the cleaned texts of the parag's lines joined by "-".
  # Blank and comment lines are copied through; page headers and data
  # lines outside parags are dropped. Per-page and whole-file counts are
  # written to {stderr}. Malformed input aborts the script with status 1.
  file_name = "stdin"
  inp.reconfigure(encoding='iso-8859-1')
  out.reconfigure(encoding='iso-8859-1')

  ndata_page = None                  # Count of data lines in current page.
  npara_page = None; npara_file = 0  # Count of parags seen, in current page and total.
  nplin_page = None; nplin_file = 0  # Count of parag data lines seen (excl non-parag ones), ditto.
  nwrit_page = None; nwrit_file = 0  # Count of output lines written (one per parag), ditto.
  npgpa_file = 0                     # Count of pages with parags, total.
  nplin_para = None                  # Count of parag data lines in current parag.

  cur_data_page = None  # The {page} of the last *data* line seen (ignoring page headers).
  head_lseq = None      # Line number of the current parag's head, or {None} if not in a parag.
  head_nline = None     # File line number of the current parag's head, for error reports.
  head_line = None      # Whole input line of the current parag's head, for error reports.
  parag_texts = [ ]     # Text lines from the current parag, if any.
  parag_nlins = [ ]     # Count of data lines in each parag.
  page_ndatas = [ ]     # Count of data lines in each page.
  page_nparas = [ ]     # Count of parags in each page.

  def data_error(nline, msg, line):
    # Reports the error {msg} for input line {line}, which is line number
    # {nline} of the file, to {stderr}, and exits with status 1.
    err.write(f"{file_name}:{nline}: ** {msg}\n")
    err.write(f"  [[{line}]]\n")
    sys.exit(1)
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

  def save_write_and_clear_page_stats(cur_pg):
    # If {cur_pg} is not {None}, appends the counts of the page just
    # finished to {page_ndatas} and {page_nparas} and writes a summary
    # line for it to {stderr}. In any case, resets the per-page counters
    # to zero, ready for the next page.
    nonlocal ndata_page, npara_page, nplin_page, nwrit_page, npgpa_file
    if cur_pg is not None:
      page_ndatas.append(ndata_page)
      page_nparas.append(npara_page)
      err.write("%-11s" % cur_pg)
      err.write(f"  {ndata_page:2d} data lines")
      err.write(f"  {npara_page:2d} parags")
      err.write(f"  {nplin_page:2d} lines in parags")
      err.write(f"  {nwrit_page:2d} lines written")
      err.write("\n")
      if npara_page > 0: npgpa_file += 1
    ndata_page = 0
    npara_page = 0
    nplin_page = 0
    nwrit_page = 0
    return
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

  def process_starps_line(nline,ndata,npage, line, page,lseq,posty,trans,text):
    nonlocal head_lseq, head_nline, head_line, parag_texts, cur_data_page
    nonlocal npara_file, nplin_file, nwrit_file
    nonlocal ndata_page, npara_page, nplin_page, nwrit_page
    nonlocal nplin_para

    # Processes a line {line} assuming it is line {nline} of the file.  Arguments:
    #
    #   {nline} serial num of the line in the file, from 1.
    #   {ndata} count of non-comment lines
    #   {npage} count of distinct pages seen.
    #   {line}  whole line, as a string, without the trailing blanks or newline.
    #   {lseq}  integer line number in page, from 1.
    #   {posty} Rene's "locator and type" code, e. g. "+P0"
    #   {trans} a transcriber code, like "U" or "U2"
    #   {text}  the Voynichese text, with various markers.
    #
    # The {line} is always a string (never {None}), but may be "" if the
    # line is empty.
    #
    # The arguments {posty} and {trans} may be strings or {None}. This
    # function just ignores them, as well as {ndata} and {npage}.
    #
    # If the line was blank or a '#'-comment, all other fields
    # ({page}, {lseq}, {posty}, {trans}, and {text}) are {None}.
    # This function copies such lines, stripped, to {stdout}.
    #
    # If the line was a page header, arguments {page} and {text} will be
    # strings (but the latter may be ""), while the {lseq}, {posty} and
    # {trans} fields will be {None}. This function discards such lines,
    # but it is an error for one to occur inside a parag.
    #
    # Otherwise the arguments {page} and {text} will be non-empty
    # strings, and {lseq} will be an integer.
    #
    # Data lines not inside parags are discarded.
    #
    # When it is given a parag head line (with "<%>" prefix), it saves
    # its {lseq} to {head_lseq}. The texts of
    # subsequent data lines are collected. When it is given a parag tail
    # line (with "<$>" suffix), writes to stdout one output line with
    # the parag's concatenated text and resets {head_lseq} to
    # {None}.

    def write_data_line(pg, hl, tl, pref, tx, suff):
      # Writes to {stdout} one line with page {pg}, head and tail line
      # numbers {hl} and {tl}, and the text {tx} bracketed by the
      # optional strings {pref} and {suff}. Counts it as one line written.
      nonlocal nwrit_file, nwrit_page
      out.write("%-8s %2d %2d " %  (pg, hl, tl))
      if pref is not None: out.write(pref)
      out.write(tx)
      if suff is not None: out.write(suff)
      out.write("\n")
      nwrit_file += 1; nwrit_page += 1
      return
      # ..................................................................

    assert line is not None, "The {line} arg must not be {None}"
    line = line.strip()

    # Pass through comments and blank lines:
    if page is None:
      out.write(line); out.write("\n")
      return

    # Ignore page headers:
    if lseq is None:
      if head_lseq is not None:
        data_error(nline, "page header within parag", line)
      return

    # Check for change in {page}:
    if page != cur_data_page:
      if head_lseq is not None:
        # Without this check the parag would be written under the tail's
        # page with the head's line number from the previous page.
        data_error(nline, "page change within parag", line)
      save_write_and_clear_page_stats(cur_data_page)
      cur_data_page = page

    ndata_page += 1

    # Remove inline comments:
    text = re.sub(r"[<][!][^<>]*[>]", "", text)

    # Check for parag markers:
    head = re.search(r"^[<][%][>]", text) is not None
    tail = re.search(r"[<][$][>]$", text) is not None

    # Remove all parag markers for now:
    text = re.sub(r"[<][%$][>]", "", text)

    # Remove alignment markers:
    text = re.sub(r"^[«=»]", "", text)
    text = re.sub(r"[«=»]$", "", text)

    # Check for spurious characters:
    if re.search(r"[\[\]<>()!%$_«=»]", text):
      data_error(nline, "invalid character in text", line)

    # Check for irregular spaces:
    if re.search(r"^[-,.]|[-,.][-,.]|[-,.]$", text):
      data_error(nline, "improper use of [-,.]", line)

    if head:
      # Start of parag:
      if head_lseq is not None:
        data_error(nline, "missing '<$>' of previous parag", line)
      npara_file += 1; npara_page += 1
      head_lseq = lseq
      head_nline = nline
      head_line = line
      nplin_para = 0

    if head_lseq is None:
      # Data line outside parag, ignore it:
      pass
    else:
      # Data line inside a parag:
      nplin_para += 1; nplin_page += 1; nplin_file += 1
      parag_texts.append(text)

    if tail:
      if head_lseq is None:
        data_error(nline, "missing '<%>' of this parag", line)
      # Tail of parag:
      write_data_line(page, head_lseq, lseq, "<%>", "-".join(parag_texts), "<$>")
      head_lseq = None
      head_nline = None
      head_line = None
      parag_texts = []
      parag_nlins.append(nplin_para)
      nplin_para = None

    return
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~..

  nread_file, ndata_file, npage_file = iv.line_loop(inp, process_starps_line, data_error)

  # A parag still open at end of input would otherwise be silently lost:
  if head_lseq is not None:
    data_error(head_nline, "missing '<$>' of last parag", head_line)

  save_write_and_clear_page_stats(cur_data_page)

  err.write(f"{nread_file:7d} total file lines\n")
  err.write(f"{ndata_file:7d} non-comment, non-blank lines\n")
  err.write(f"{npage_file:7d} distinct pages seen on input\n")
  err.write(f"{npgpa_file:7d} pages with parags\n")
  err.write(f"{npara_file:7d} parags\n")
  err.write(f"{nplin_file:7d} parag data lines found\n")
  err.write(f"{ndata_file-nplin_file:7d} data lines not in parags (omitted)\n")
  err.write(f"{nwrit_file:7d} data lines written\n")

  # NOTE(review): this page count is hard-coded, apparently for one
  # specific input file -- confirm before using the script on other data.
  npage_actual = 22.6 # Since the last line has only 30 out of 50 parag lines.
  compute_and_print_stats("parags per page",      page_nparas, npage_actual)
  compute_and_print_stats("data lines per page",  page_ndatas, npage_actual)
  compute_and_print_stats("data lines per parag", parag_nlins)

  out.flush()
  return
  # ----------------------------------------------------------------------

def arg_error(msg):
  # Writes the error message {msg} and a usage summary to {stderr}, then
  # terminates the script with exit status 1.
  #
  # The usage summary is the module-level string {usage} if one is
  # defined. This file defines none, so a generic summary is used
  # instead of failing with a {NameError} before the exit.
  err.write(f"** {msg}\n")
  err.write("usage:\n")
  usage_text = globals().get("usage")
  if usage_text is None:
    usage_text = f"  {sys.argv[0]} < INFILE > OUTFILE\n"
  err.write(usage_text)
  sys.exit(1)
  # ----------------------------------------------------------------------

def set_error(fname, nline, msg):
  # Reports the error {msg} for line {nline} of file {fname} on {stderr},
  # as "{fname}:{nline}: ** {msg}", and terminates the script with exit
  # status 1.
  print(f"{fname}:{nline}: ** {msg}", file=err)
  raise SystemExit(1)
  # ----------------------------------------------------------------------

# Run the script only when executed directly, not when imported.
if __name__ == "__main__":
  main()