#! /usr/bin/python3
# Last edited on 2026-03-17 03:41:27 by stolfi
#
# Takes one command line argument, a unit type {utype}.
# Reads from {stdin} a file with the Starred Parags section
# (SPS) of the VMS in the ".ivt" format, similar to IVTFF.
# Writes the same contents to {stdout} in the same format, with some
# cleanup.
#
# The input file is assumed to be in the iso-8859-1 (Latin-1) encoding.
# The output file will be in the Unicode UTF-8 encoding.
#
# On input, each data line must have the format "{ILOC} {TEXT}" where {ILOC}
# is a locus ID and {TEXT} is one line of VMS text.
#
# The locus ID must be "<{PAGE}.{LSEQ};{TRANS},{POSTY}>", where {PAGE}
# is a page's f-number (like "f103r" or "f111v"), {LSEQ} is a
# non-negative line number in the page, {TRANS} is a transcriber's code
# [A-Z]+, and {POSTY} is Rene's position-and-type code. The ";{TRANS}"
# and/or the ",{POSTY}" parts may be missing. The {LSEQ}s within each
# page must be consecutive integers starting with 1.
#
# In addition to data lines, the input file may also contain blank lines
# and comment lines that start with '#'. Those lines are passed through,
# except that any "-*- coding: ... -*-" comment line found in the input
# is dropped, and a line "# -*- coding: utf-8 -*-" is written at the
# beginning of the output file.
#
# The input file may also contain page header lines that start with "<{PAGE}>".
# These lines are discarded.
#
# See {clean_up_starps_raw_text} in {size_position_funcs.py} for what is
# expected in the input {TEXT} and the cleanup actions performed on each
# {TEXT}. This script preserves the parag head and tail markers "<%>"
# and "<$>".
#
# Discards data lines whose {TEXT}, after cleanup, is empty or consists
# entirely of '?' EVA characters.
import sys, os, re
from sys import stderr as err
from error_funcs import prog_error, arg_error
from process_funcs import bash
import ivtff_format as ifm
from note_077_funcs import compute_and_print_stats
from math import floor, ceil
import size_position_funcs as spf

def main(utype):
    """Clean up the SPS ".ivt" file read from {stdin}, writing it to {stdout}.

    The input is decoded as iso-8859-1 and the output is encoded as
    UTF-8. Each data line has its {TEXT} cleaned by
    {spf.clean_up_starps_raw_text}, which also receives {utype}.
    Per-page and whole-file line counts are written to {stderr}.
    """
    rd = sys.stdin
    rd.reconfigure(encoding='iso-8859-1')
    wr = sys.stdout
    wr.reconfigure(encoding='utf-8')

    wr.write("# -*- coding: utf-8 -*-\n")

    ndata_file = 0  # Count of parag data lines seen in whole file.
    nwrit_file = 0  # Count of parag lines written in whole file.

    cur_data_page = None  # The {page} of the last *data* line seen (ignoring page headers).
    ndata_page = None  # Count of data lines in current page.
    nwrit_page = None  # Count of parag data lines written for current page.
    ndatas_per_page = []  # Count of data lines in each page of the file.

    def data_error(nline, msg, line):
        """Report a fatal problem with input line {nline} and abort."""
        err.write(f"stdin:{nline}: ** {msg}\n")
        err.write(f" [[{line}]]\n")
        # Raised explicitly (rather than "assert False") so that the
        # abort still happens when Python runs with -O, which strips asserts.
        raise AssertionError(msg)
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    def data_warning(nline, msg, line):
        """Report a non-fatal problem with input line {nline}."""
        err.write(f"stdin:{nline}: !! {msg}\n")
        err.write(f" [[{line}]]\n")
        return
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    def save_write_and_clear_page_stats(cur_pg):
        """Close the statistics of page {cur_pg} and reset the page counters.

        If {cur_pg} is not {None}, appends the page's data line count to
        {ndatas_per_page} and prints the page's counts to {stderr}.
        In any case resets {ndata_page} and {nwrit_page} to zero.
        """
        nonlocal ndata_page, nwrit_page
        if cur_pg is not None:
            ndatas_per_page.append(ndata_page)
            err.write("%-11s" % cur_pg)
            err.write(f" {ndata_page:2d} data lines")
            err.write(f" {nwrit_page:2d} lines written")
            err.write("\n")
        ndata_page = 0
        nwrit_page = 0
        return
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    def process_starps_line(nline, ndata, npage, line, page, lseq, posty, trans, text):
        """Process one parsed input line; passed as a callback to {ifm.line_loop}.

        Arguments:

          {nline}  serial num of the line in the file, from 1.
          {ndata}  count of non-comment lines.
          {npage}  count of distinct pages seen.
          {line}   whole line, as a string, without the trailing blanks or newline.
          {page}   the page's f-number, like "f103r".
          {lseq}   integer line number in page, from 1.
          {posty}  Rene's "locator and type" code, e. g. "+P0".
          {trans}  a transcriber code, like "U" or "U2".
          {text}   the Voynichese text, with various markers.

        The {line} is always a string (never {None}), but may be "" if the
        line is empty.

        The arguments {posty} and {trans} may be strings or {None}.
        This function just ignores them.

        If the line was blank or a '#'-comment, all other fields
        ({page}, {lseq}, {posty}, {trans}, and {text}) are {None}.
        Such lines are copied to the output, except for "coding:" lines.

        If the line was a page header, arguments {page} and {text} will be
        strings (but the latter may be ""), while the {lseq}, {posty} and
        {trans} fields will be {None}. This function ignores such lines.

        Otherwise the arguments {page} and {text} will be non-empty
        strings, and {lseq} will be an integer.
        """
        nonlocal cur_data_page, ndata_file, ndata_page

        def write_data_line(loc, head, text, tail):
            """Write one data line, with optional "<%>" and "<$>" markers."""
            nonlocal nwrit_file, nwrit_page
            ploc = f"<{loc}>"
            wr.write("%-12s " % ploc)
            if head:
                wr.write("<%>")
            wr.write(text)
            if tail:
                wr.write("<$>")
            wr.write("\n")
            nwrit_file += 1
            nwrit_page += 1
            return
            # ..................................................................

        def cleaning_error(msg):
            """Error callback for the cleanup function; reports the current line."""
            data_error(nline, msg, line)
            # ..................................................................

        assert line is not None, "The {line} arg must not be {None}"
        line = line.strip()

        # Pass through comments and blank lines, except any "coding:" line,
        # since {main} already wrote its own at the top of the output:
        if page is None:
            if not re.search(r"^[# ]*[-][*][-] *coding:.*[-][*][-]", line):
                wr.write(line)
                wr.write("\n")
            return

        # Ignore page headers:
        if lseq is None:
            return

        # Check for change in {page}:
        if page != cur_data_page:
            save_write_and_clear_page_stats(cur_data_page)
            cur_data_page = page

        ndata_page += 1
        ndata_file += 1

        text, head, tail = spf.clean_up_starps_raw_text(text, utype, cleaning_error)
        # NOTE(review): the file header says that lines whose cleaned {TEXT}
        # is empty or all '?' are discarded, but no such filter is applied
        # here -- confirm whether {spf.clean_up_starps_raw_text} handles it.
        loc = f"{page}.{lseq}"
        write_data_line(loc, head, text, tail)
        return
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    # The second result is bound to its own name so that it does not
    # overwrite {ndata_file}, which {process_starps_line} has been updating.
    nread_file, nnonc_file, npage_file = ifm.line_loop(rd, process_starps_line, data_error)
    # Account for the last page, which has no following page to trigger it:
    save_write_and_clear_page_stats(cur_data_page)

    err.write(f"{nread_file:7d} total file lines\n")
    err.write(f"{nnonc_file:7d} non-comment, non-blank lines\n")
    err.write(f"{npage_file:7d} distinct pages seen on input\n")
    err.write(f"{ndata_file:7d} parag data lines found\n")
    err.write(f"{nwrit_file:7d} data lines written\n")

    # Original note: "Since the last line has only 30 out of 50 parag lines."
    # NOTE(review): presumably "last page" is meant (22 full pages + 30/50) -- confirm.
    npage_actual = 22.6
    vnum, vtot, vmin, vsin, vmax, vsax, vavg, vdev = compute_and_print_stats(
        "data lines per page", ndatas_per_page, npage_actual
    )

    wr.close()
    return
    # ----------------------------------------------------------------------

main(sys.argv[1])