#! /usr/bin/python3
# Last edited on 2026-03-17 03:41:05 by stolfi
#
# Takes one command line argument, a unit type {utype}.
#
# Reads from {stdin} a file with the Starred Parags section (SPS) of the
# VMS in the ".ivt" format, similar to IVTFF.
#
# Writes the same contents to {stdout} in the same format, with the
# lines of each parag joined into a single line of the file. The input
# file is assumed to be in the Unicode UTF-8 encoding, and the output
# file will be in the same encoding.
#
# Each input line should have the format "<{LOC}> {TEXT}" where {LOC}
# is a locus ID and {TEXT} is a line of Voynichese text in EVA. The
# {TEXT} is assumed to have been cleaned as per
# {cleanup_raw_starps_text} in {size_position_funcs.py}
#
# The locus ID must be "<{PAGE}.{LSEQ}>", where {PAGE} is a page's
# f-number (like "f103r" or "f111v"), and {LSEQ} is a non-negative line
# number in the page.
#
# In addition to data lines, the input file may also contain blank lines
# and comment lines that start with '#'. Any such lines are ignored,
# but a line "# -*- coding: utf-8 -*-" is written at the beginning
# of the output file.
#
# The input file should not contain page header lines that start with
# "<{PAGE}>".
#
# The script assumes that every parag has a "<%>" prefix on its first
# data line (the /head/) and a "<$>" suffix on its last data line (the
# /tail/).
#
# Any data lines that are not part of a parag are discarded.
#
# The script joins the {TEXT} fields of all the lines of each parag
# into a single string {PTEXT}. If {utype} is "wc" or "wp", a '.' is
# inserted between the {TEXT}s of different lines. If {utype} is "ec",
# the {TEXT}s are joined with no separator.
#
# The program then writes the result as a line "<{LOC}> {PTEXT}" where
# {LOC} is the locus ID "{PAGE}.{LSEQ}" of the parag's head line.
import sys
import os
import re
from sys import stderr as err
from error_funcs import prog_error, arg_error
from process_funcs import bash
import ivtff_format as ifm
from note_077_funcs import compute_and_print_stats
from math import floor, ceil
import size_position_funcs as spf


def main(utype):
    """Join the lines of each parag read from {stdin} into one output line.

    Reads data lines through {ifm.line_loop}, collects the cleaned text
    of every line between a parag head and the matching parag tail (as
    reported by {spf.clean_up_starps_raw_text}), and writes one line
    "<{PAGE}.{LSEQ}> {PTEXT}" per parag to {stdout}, where {PAGE} and
    {LSEQ} are those of the parag's head line. The line texts are joined
    with no separator when {utype} is "ec", and with '.' otherwise.

    Per-page and whole-file statistics are written to {stderr}.

    Raises {AssertionError} (through {data_error}) on malformed input:
    a page header line, a head inside an open parag, or a tail with no
    open parag.
    """
    rd = sys.stdin
    rd.reconfigure(encoding='utf-8')
    wr = sys.stdout
    wr.reconfigure(encoding='utf-8')
    wr.write("# -*- coding: utf-8 -*-\n")

    npara_file = 0  # Count of parags seen in whole file.
    nplin_file = 0  # Count of parag data lines seen in whole file.
    npgpa_file = 0  # Count of pages with parags in whole file.
    nwrit_file = 0  # Count of parag lines written in whole file.

    cur_data_page = None  # The {page} of the last *data* line seen.
    ndata_page = None  # Count of data lines in current page.
    npara_page = None  # Count of parags seen in current page.
    nplin_page = None  # Count of parag data lines seen in current page.
    nwrit_page = None  # Count of parag lines written while in current page.

    head_page = None  # Page of the current parag's head, or {None} if not in a parag.
    head_lseq = None  # Line number of the current parag's head, or {None} if not in a parag.
    nplin_para = None  # Count of parag data lines in current parag.
    parag_texts = []  # Text lines from the current parag, if any.

    nlins_per_parag = []  # Count of lines in each parag of the file.
    ndatas_per_page = []  # Count of lines in each page of the file.
    nparas_per_page = []  # Count of parags in each page of the file.

    def data_error(nline, msg, line):
        """Report a fatal error at input line {nline} and abort."""
        err.write(f"stdin:{nline}: ** {msg}\n")
        err.write(f" [[{line}]]\n")
        # Raised explicitly: a bare {assert False} is removed under
        # "python -O", which would let processing continue after a
        # fatal input error.
        raise AssertionError(msg)
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    def data_warning(nline, msg, line):
        """Report a non-fatal problem at input line {nline}."""
        err.write(f"stdin:{nline}: !! {msg}\n")
        err.write(f" [[{line}]]\n")
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    def save_write_and_clear_page_stats(cur_pg):
        """Save and print the counters of page {cur_pg}, then reset them.

        If {cur_pg} is {None} (no page seen yet), nothing is saved or
        printed; the per-page counters are just set to zero.
        """
        nonlocal ndata_page, npara_page, nplin_page, nwrit_page, npgpa_file
        if cur_pg is not None:
            ndatas_per_page.append(ndata_page)
            nparas_per_page.append(npara_page)
            err.write("%-11s" % cur_pg)
            err.write(f" {ndata_page:2d} data lines")
            err.write(f" {npara_page:2d} parags")
            err.write(f" {nplin_page:2d} lines in parags")
            err.write(f" {nwrit_page:2d} lines written")
            err.write("\n")
            if npara_page > 0:
                npgpa_file += 1
        ndata_page = 0
        npara_page = 0
        nplin_page = 0
        nwrit_page = 0
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    def write_data_line(pg, hd_lseq, tl_lseq, tx):
        """Write the parag line "<{pg}.{hd_lseq}> {tx}" to {stdout}.

        The tail line number {tl_lseq} is accepted but not used.
        """
        nonlocal nwrit_file, nwrit_page
        ploc = f"<{pg}.{hd_lseq}>"
        wr.write("%-12s " % ploc)
        wr.write(tx)
        wr.write("\n")
        nwrit_file += 1
        nwrit_page += 1
    # ..................................................................

    def process_starps_line(nline, ndata, npage, line, page, lseq, posty, trans, text):
        # Processes a line {line} assuming it is line {nline} of the file.
        # This is the per-line callback given to {ifm.line_loop}. Arguments:
        #
        #   {nline} serial num of the line in the file, from 1.
        #   {ndata} count of non-comment lines (not used here).
        #   {npage} count of distinct pages seen (not used here).
        #   {line}  whole line, as a string, without the trailing blanks or newline.
        #   {page}  the page's f-number.
        #   {lseq}  integer line number in page, from 1.
        #   {posty} Rene's "locator and type" code. Should be {None}.
        #   {trans} a transcriber code, like "U" or "U2". Should be {None}.
        #   {text}  the Voynichese text, with various markers.
        #
        # The {line} is always a string (never {None}), but may be "" if the
        # line is empty.
        #
        # If the line was blank or a '#'-comment, all other fields
        # ({page}, {lseq}, {posty}, {trans}, and {text}) are {None}.
        # This function ignores such lines.
        #
        # If the {page} is not {None} but the {lseq} is {None},
        # the line looks like a page header. There should be no
        # such lines; it is a fatal error.
        #
        # Otherwise the arguments {page} and {text} will be non-empty
        # strings, and {lseq} will be an integer.
        #
        # Data lines not inside parags are discarded.
        #
        # When it is given a parag head line, it saves its {page} and
        # {lseq} to {head_page} and {head_lseq}. The texts of the head and
        # of subsequent data lines are collected. When it is given a parag
        # tail line, writes to {stdout} one output line with the head's
        # locus and the parag's concatenated text, and resets {head_page}
        # and {head_lseq} to {None}.
        nonlocal head_page, head_lseq, parag_texts, cur_data_page
        nonlocal npara_file, nplin_file
        nonlocal ndata_page, npara_page, nplin_page
        nonlocal nplin_para

        assert line is not None, "The {line} arg must not be {None}"
        line = line.strip()

        # Discard comments and blank lines:
        if page is None:
            return

        # Page headers are not allowed:
        if lseq is None:
            data_error(nline, "page headers not allowed", line)

        # Check for change in {page}:
        if page != cur_data_page:
            save_write_and_clear_page_stats(cur_data_page)
            cur_data_page = page

        ndata_page += 1

        text, head, tail = spf.clean_up_starps_raw_text(text, utype, data_error)

        if head:
            # Start of parag:
            if head_lseq is not None:
                data_error(nline, "missing '<$>' of previous parag", line)
            npara_file += 1
            npara_page += 1
            # Remember the head's page too: the parag may end on a later
            # page, and the output locus must be that of the head line.
            head_page = page
            head_lseq = lseq
            nplin_para = 0

        if head_lseq is not None:
            # Data line inside a parag:
            nplin_para += 1
            nplin_page += 1
            nplin_file += 1
            parag_texts.append(text)
        # Otherwise it is a data line outside any parag; it is ignored.

        if tail:
            if head_lseq is None:
                data_error(nline, "missing '<%>' of this parag", line)
            # Tail of parag:
            sep = "" if utype == "ec" else "."
            write_data_line(head_page, head_lseq, lseq, sep.join(parag_texts))
            head_page = None
            head_lseq = None
            parag_texts = []
            nlins_per_parag.append(nplin_para)
            nplin_para = None
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~..

    nread_file, ndata_file, npage_file = ifm.line_loop(rd, process_starps_line, data_error)

    if head_lseq is not None:
        # The input ended inside a parag; its collected lines were never
        # written. Report it instead of dropping them silently.
        data_warning(
            nread_file,
            "missing '<$>' of last parag; parag not written",
            f"<{head_page}.{head_lseq}>",
        )

    save_write_and_clear_page_stats(cur_data_page)

    err.write(f"{nread_file:7d} total file lines\n")
    err.write(f"{ndata_file:7d} non-comment, non-blank lines\n")
    err.write(f"{npage_file:7d} distinct pages seen on input\n")
    err.write(f"{npgpa_file:7d} pages with parags\n")
    err.write(f"{npara_file:7d} parags\n")
    err.write(f"{nplin_file:7d} parag data lines found\n")
    err.write(f"{ndata_file-nplin_file:7d} data lines not in parags (omitted)\n")
    err.write(f"{nwrit_file:7d} data lines written\n")

    # Since the last line has only 30 out of 50 parag lines.
    # NOTE(review): hard-coded for one specific input; "last line" above
    # presumably means "last page" -- confirm.
    npage_actual = 22.6
    compute_and_print_stats("parags per page", nparas_per_page, npage_actual)
    compute_and_print_stats("data lines per page", ndatas_per_page, npage_actual)
    compute_and_print_stats("data lines per parag", nlins_per_parag)

    wr.close()
# ----------------------------------------------------------------------


if __name__ == "__main__":
    if len(sys.argv) < 2:
        err.write(f"usage: {sys.argv[0]} {{utype}} < input.ivt > output.ivt\n")
        sys.exit(1)
    main(sys.argv[1])