#! /usr/bin/python3
# Last edited on 2026-02-10 06:53:19 by stolfi

import sys, os, re
from sys import stdout as out, stderr as err, stdin as inp
from process_funcs import bash
import ivtff_format as iv
from note_077_funcs import compute_and_print_stats
from math import floor, ceil

# Reads from {stdin} a file in a format similar to IVTFF. Writes the same
# to {stdout}, with the lines of each parag joined in a single line of
# the file.
#
# On input, each line must have the format "{ILOC} {TEXT}" where
# {ILOC} is a locus ID and {TEXT} is one line of VMS text.
#
# The locus ID must be "<{PAGE}.{LSEQ};{TRANS},{POSTY}>", where {PAGE}
# is a page's f-number (like "f103r" or "f111v"), {LSEQ} is a
# non-negative line number in the page, {TRANS} is a transcriber's code
# [A-Z]+, and {POSTY} is Rene's position-and-type code. The ";{TRANS}"
# and/or the ",{POSTY}" parts may be missing. The {LSEQ}s within each
# page must be consecutive integers starting with 1.
#
# The {TEXT} must be a string that may contain any of these:
#
#   Prefix "<%>" marking the line as the head of a paragraph.
#
#   Prefix [«=»] to indicate the alignment of the start of the line
#   relative to the left rail.
#
#   EVA letters [A-Za-z].
#
#   Invalid EVA code '?'.
#
#   Weirdo codes "&{NNN};" (possibly without the ';')
#   where {NNN} is three decimal digits.
#
#   Ligatures consisting of two or more EVA letters, '?', or
#   weirdo codes enclosed in braces "{...}".
#
#   Word separators [-.,].
#
#   Inline comments "<!...>", including various special comments
#   to indicate stars in the margin, wide linegaps, figure intrusions,
#   vellum folds, etc.
#
#   Suffix [«=»] to indicate the alignment of the end of the line
#   relative to the right rail.
#
#   Suffix "<$>" to mark the line as the tail of a parag.
#
# The input file may also contain blank lines, comment lines that start
# with '#', and page header lines that start with "<{PAGE}>".
#
# The {TEXT} must contain at least one EVA letter, '?', or weirdo code.
#
# The script copies to the output file all blank lines and comment
# lines. Page header lines are NOT copied (the page is repeated on
# every output line instead).
#
# From data lines, it deletes all inline comments and alignment markers [«=»].
#
# The script assumes that every parag has a "<%>" prefix on its first
# data line (the /head/) and a "<$>" suffix on its last data line (the
# /tail/). The script joins the {TEXT} field of all the lines of each
# parag, cleaned as per above, into a single string {PTEXT}, with "-"
# separators and both the "<%>" prefix and "<$>" suffix. Then writes a
# line "{PAGE} {LHEAD} {LTAIL} {PTEXT}" where {PAGE} is the parag's page,
# {LHEAD} and {LTAIL} are the {LSEQ} fields of the first and last lines.
#
# Data lines that are not part of a parag are omitted.
#
# It is an error if a parag is still open (no "<$>" seen) when the page
# changes or when the input ends.

# Usage text printed by {arg_error}. The script takes no arguments.
usage = "  " + os.path.basename(sys.argv[0]) + " < INFILE > OUTFILE\n"


def main():
    file_name = "stdin"
    inp.reconfigure(encoding='iso-8859-1')
    out.reconfigure(encoding='iso-8859-1')

    # The per-page counters are {None} until the first data line is seen;
    # {save_write_and_clear_page_stats} then resets them to zero.
    ndata_page = None  # Count of data lines in current page.
    npara_page = None; npara_file = 0  # Count of parags seen, in current page and total
    nplin_page = None; nplin_file = 0  # Count of parag data lines seen (excl non-parag ones), ditto.
    nwrit_page = None; nwrit_file = 0  # Count of parag data lines written, ditto.
    npgpa_file = 0  # Count of pages with parags, total.
    nplin_para = None  # Count of parag data lines in current parag.

    cur_data_page = None  # The {page} of the last *data* line seen (ignoring page headers).
    head_lseq = None  # Line number of the current parag's head, or {None} if not in a parag.
    parag_texts = []  # Text lines from the current parag, if any.
    parag_nlins = []  # Count of lines in each parag.
    page_ndatas = []  # Count of lines in each page.
    page_nparas = []  # Count of parags in each page.

    def data_error(nline, msg, line):
        # Reports an error {msg} about input line {line}, which is line
        # number {nline} of the file, and aborts with status 1.
        err.write(f"{file_name}:{nline}: ** {msg}\n")
        err.write(f" [[{line}]]\n")
        sys.exit(1)
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    def save_write_and_clear_page_stats(cur_pg):
        # If {cur_pg} is not {None}, saves the counts of the page just
        # finished into {page_ndatas,page_nparas}, prints a one-line
        # summary of that page to {stderr}, and updates {npgpa_file}.
        # In any case, resets the per-page counters to zero.
        # Only names that are rebound here need {nonlocal}.
        nonlocal ndata_page, npara_page, nplin_page, nwrit_page, npgpa_file
        if cur_pg is not None:
            page_ndatas.append(ndata_page)
            page_nparas.append(npara_page)
            err.write("%-11s" % cur_pg)
            err.write(f" {ndata_page:2d} data lines")
            err.write(f" {npara_page:2d} parags")
            err.write(f" {nplin_page:2d} lines in parags")
            err.write(f" {nwrit_page:2d} lines written")
            err.write("\n")
            if npara_page > 0:
                npgpa_file += 1
        ndata_page = 0
        npara_page = 0
        nplin_page = 0
        nwrit_page = 0
        return
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    def write_data_line(pg, hl, tl, pref, tx, suff):
        # Writes to {stdout} one line "{pg} {hl} {tl} {pref}{tx}{suff}",
        # where {pref} and/or {suff} are omitted if {None}, and counts
        # it in {nwrit_file,nwrit_page}.
        nonlocal nwrit_file, nwrit_page
        out.write("%-8s %2d %2d " % (pg, hl, tl))
        if pref is not None:
            out.write(pref)
        out.write(tx)
        if suff is not None:
            out.write(suff)
        out.write("\n")
        nwrit_file += 1; nwrit_page += 1
        return
        # ..................................................................

    def process_starps_line(nline,ndata,npage, line, page,lseq,posty,trans,text):
        nonlocal head_lseq, parag_texts, cur_data_page
        nonlocal npara_file, nplin_file
        nonlocal ndata_page, npara_page, nplin_page
        nonlocal nplin_para
        # Processes a line {line} assuming it is line {nline} of the file. Arguments:
        #
        #   {nline} serial num of the line in the file, from 1.
        #   {ndata} count of non-comment lines
        #   {npage} count of distinct pages seen.
        #   {line}  whole line, as a string, without the trailing blanks or newline.
        #   {page}  the page's f-number, e. g. "f103r".
        #   {lseq}  integer line number in page, from 1.
        #   {posty} Rene's "locator and type" code, e. g. "+P0"
        #   {trans} a transcriber code, like "U" or "U2"
        #   {text}  the Voynichese text, with various markers.
        #
        # The {line} is always a string (never {None}), but may be "" if the
        # line is empty.
        #
        # The arguments {posty} and {trans} (transcriber code) may be strings
        # or {None}. This function just ignores them, as well as {ndata}
        # and {npage}.
        #
        # If the line was blank or a '#'-comment, all other fields
        # ({page}, {lseq}, {posty}, {trans}, and {text}) are {None}.
        # This function copies such lines to {stdout}, stripped of
        # leading and trailing blanks.
        #
        # If the line was a page header, arguments {page} and {text} will be
        # strings (but the latter may be ""), while the {lseq}, {posty} and
        # {trans} fields will be {None}. This function discards such lines,
        # but fails if one occurs inside a parag.
        #
        # Otherwise the arguments {page} and {text} will be non-empty
        # strings, and {lseq} will be an integer.
        #
        # Data lines not inside parags are discarded.
        #
        # When it is given a parag head line (with "<%>" prefix), it saves
        # its {lseq} to {head_lseq}. The texts of
        # subsequent data lines are collected. When it is given a parag tail
        # line (with "<$>" suffix), writes to stdout one output line with
        # the parag's concatenated text and resets {head_lseq} to
        # {None}.

        assert line is not None, "The {line} arg must not be {None}"
        line = line.strip()

        # Pass through comments and blank lines:
        if page is None:
            out.write(line); out.write("\n")
            return

        # Ignore page headers:
        if lseq is None:
            if head_lseq is not None:
                data_error(nline, "page header within parag", line)
            return

        # Check for change in {page}:
        if page != cur_data_page:
            # A parag must not straddle two pages, even if the input has
            # no page header line between them:
            if head_lseq is not None:
                data_error(nline, "page change within parag", line)
            save_write_and_clear_page_stats(cur_data_page)
            cur_data_page = page

        ndata_page += 1

        # Remove inline comments:
        text = re.sub(r"[<][!][^<>]*[>]", "", text)

        # Check for parag markers:
        head = re.search(r"^[<][%][>]", text) is not None
        tail = re.search(r"[<][$][>]$", text) is not None

        # Remove all parag markers for now:
        text = re.sub(r"[<][%$][>]", "", text)

        # Remove alignment markers:
        text = re.sub(r"^[«=»]", "", text)
        text = re.sub(r"[«=»]$", "", text)

        # Check for spurious characters:
        if re.search(r"[\[\]<>()!%$_«=»]", text):
            data_error(nline, "invalid character in text", line)

        # Check for irregular spaces:
        if re.search(r"^[-,.]|[-,.][-,.]|[-,.]$", text):
            data_error(nline, "improper use of [-,.]", line)

        if head:
            # Start of parag:
            if head_lseq is not None:
                data_error(nline, "missing '<$>' of previous parag", line)
            npara_file += 1; npara_page += 1
            head_lseq = lseq
            nplin_para = 0

        if head_lseq is None:
            # Data line outside parag, ignore it.
            pass
        else:
            # Data line inside a parag:
            nplin_para += 1; nplin_page += 1; nplin_file += 1
            parag_texts.append(text)

        if tail:
            if head_lseq is None:
                data_error(nline, "missing '<%>' of this parag", line)
            # Tail of parag:
            write_data_line(page, head_lseq, lseq, "<%>", "-".join(parag_texts), "<$>")
            head_lseq = None
            parag_texts = []
            parag_nlins.append(nplin_para)
            nplin_para = None

        return
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~..

    nread_file, ndata_file, npage_file = iv.line_loop(inp, process_starps_line, data_error)

    # A parag that is still open here would otherwise be dropped silently,
    # even though its lines were already counted in the statistics:
    if head_lseq is not None:
        data_error(
            nread_file,
            f"missing '<$>' of parag starting at <{cur_data_page}.{head_lseq}> before end of input",
            "",
        )

    # Flush the statistics of the last page:
    save_write_and_clear_page_stats(cur_data_page)

    err.write(f"{nread_file:7d} total file lines\n")
    err.write(f"{ndata_file:7d} non-comment, non-blank lines\n")
    err.write(f"{npage_file:7d} distinct pages seen on input\n")
    err.write(f"{npgpa_file:7d} pages with parags\n")
    err.write(f"{npara_file:7d} parags\n")
    err.write(f"{nplin_file:7d} parag data lines found\n")
    err.write(f"{ndata_file-nplin_file:7d} data lines not in parags (omitted)\n")
    err.write(f"{nwrit_file:7d} data lines written\n")

    # NOTE(review): this page count is hard-wired for one specific input
    # file; confirm or recompute before using the script on other data.
    npage_actual = 22.6  # Since the last line has only 30 out of 50 parag lines.
    compute_and_print_stats("parags per page", page_nparas, npage_actual)
    compute_and_print_stats("data lines per page", page_ndatas, npage_actual)
    compute_and_print_stats("data lines per parag", parag_nlins)

    out.flush()
    return
    # ----------------------------------------------------------------------


def arg_error(msg):
    # Reports a command line error {msg}, prints the {usage} text,
    # and aborts with status 1.
    err.write(f"** {msg}\n")
    err.write("usage:\n")
    err.write(usage)
    sys.exit(1)
    # ----------------------------------------------------------------------


def set_error(fname, nline, msg):
    # Reports an error {msg} about line {nline} of file {fname},
    # and aborts with status 1.
    err.write(f"{fname}:{nline}: ** {msg}\n")
    sys.exit(1)
    # ----------------------------------------------------------------------


main()