#! /usr/bin/python3
# Last edited on 2026-03-06 07:48:47 by stolfi

# Takes as arguments the name {ivt_name} of a transcription file,
# without the folder or ".ivt" extension, and a text unit type {utype}
# ("ch", "wc", etc.).
#
# Reads the transcription file "res/{ivt_name}.ivt" in pseudo-IVTFF
# format. Writes a file "res/{ivt_name}.upp" with the raw size (number
# of units) of each paragraph. Also writes "res/{ivt_name}-upp-parms.tex"
# with LaTeX macro definitions summarizing the paragraph sizes.
#
# Ignores #-comments and blank lines.
#
# Each non-ignored input line must have the format "<{LOC}> {TEXT}"
# where {LOC} is a locus ID like "f105v.20" or "b2.1.050", and {TEXT} is
# a line of the transcribed text.
#
# The unit type {utype} specifies the nature of the {TEXT} and of the
# units used when measuring line sizes. In particular, this script
# assumes that the {TEXT} has been cleaned-up according to the {utype}.
# See {clean_up_raw_text} in {size_position_funcs.py} for details.
#
# In any case the input file is assumed to be in Unicode UTF-8 encoding,
# and so will be the output file.
#
# Each output line has the format "{LOC} {PSIZE}" where
# {PSIZE} is the number of units of a specified type (hanzi characters,
# EVA characters, EVA words, pinyin syllables, ...) in the paragraph.
#

import sys, os, re
from sys import stderr as err
from process_funcs import bash, basic_line_loop
from error_funcs import arg_error, file_line_error, prog_error
from chinese_funcs import read_chinese_char_set
from note_077_funcs import compute_and_print_stats, name_for_tex_macro
import size_position_funcs as spf


def main(ivt_name, utype):
    # Reads "res/{ivt_name}.ivt", writes one "{LOC} {PSIZE}" line per
    # data line to "res/{ivt_name}.upp", prints totals to stderr, and
    # then calls {write_TeX_parms_file} to write the LaTeX parameters.
    #
    # Exits with status 1 if the input file has no data lines, since
    # the per-paragraph averages are then undefined.

    ivt_file = f"res/{ivt_name}.ivt"
    upp_file = f"res/{ivt_name}.upp"

    # Obtained before opening any file, so that a failure here does not
    # leave a truncated output file behind. {clean_sepa} is not used in
    # this script.
    pat_line, pat_unit, pat_sepa, clean_sepa = spf.get_parsing_patterns(utype)

    # Pattern for IVTFF page header lines, which carry no text.
    # NOTE(review): the pattern previously used here was the empty
    # regex, which matches every line and therefore discarded all data
    # lines. This replacement assumes that a page header is a locus
    # with no "." in it, followed by nothing or by a "<!...>" comment.
    # Please confirm against the actual input files.
    pat_page_header = re.compile(r"\s*<[^<>.\s]+>\s*(<!|$)")

    tot_para = 0  # Count of data lines.
    tot_unit = 0  # Total count of units in input file.
    tot_sepa = 0  # Total chars removed in cleanup.
    psizes = []   # List of raw sizes of parags, for statistics.

    def process_input_line(nline, line):
        nonlocal tot_para, tot_unit, tot_sepa
        #
        # Parses a line {line} assuming it is line {nline} of the file.
        # The {line} is always a string (never {None}), but may be "" if
        # the line is empty.
        #
        # Ignores the line if it is a blank, a #-comment, or a page header.
        #
        # Otherwise the line must be a data line, matching {pat_line}.
        #
        # Increments {tot_para} for each data line.
        #
        # Splits the {text} into units according to {utype}, updating
        # {tot_sepa}, {tot_unit}. The number {psize_raw} of units of
        # type {utype} is the raw size of the parag.
        #
        # Writes to the ".upp" file one output line for each input data
        # line, and appends {psize_raw} to {psizes}.

        # Should we debug the parag?
        debug = False

        def data_error(msg):
            # Reports {msg} as an error on the current input line.
            # Reads {line} from the enclosing call, so it shows the
            # stripped version once that has been computed.
            file_line_error(ivt_file, nline, msg, line)
            # {file_line_error} is expected not to return. Raised
            # explicitly so the check survives "python -O".
            raise AssertionError("file_line_error returned")

        assert line is not None, "The {line} arg must not be {None}"

        # Ignore comments and blank lines:
        if re.match(r" *([#]|$)", line):
            return

        # Just in case, ignore IVTFF page headers:
        if pat_page_header.match(line):
            return

        tot_para += 1

        line = line.strip()
        m = re.fullmatch(pat_line, line)
        if m is None:
            # Invalid line format.
            data_error(f"invalid line format for {pat_line = !r}")

        # Parse the line into locus ID and text:
        assert m.lastindex == 2, f"bug {m.lastindex = }"
        loc = m.group(1)
        text = m.group(2)

        units, ct_sepa = spf.split_text_into_units(
            text, utype, pat_unit, pat_sepa, data_error
        )
        ct_unit = len(units)

        tot_unit += ct_unit
        tot_sepa += ct_sepa

        psize_raw = ct_unit
        psizes.append(psize_raw)

        if debug:
            err.write(f"!~ {loc:<12s} {ct_unit = :5d} {ct_sepa = :5d}\n")

        upp_wr.write(f"{loc:<12s} {psize_raw:5d}\n")
        return
        # ......................................................................

    err.write(f"reading file '{ivt_file}' ...\n")
    # Both files are closed even if a line fails to parse.
    with open(ivt_file, "r", encoding="utf-8") as ivt_rd:
        with open(upp_file, "w", encoding="utf-8") as upp_wr:
            upp_wr.write("# -*- coding: utf-8 -*-\n")
            nread = basic_line_loop(ivt_rd, process_input_line)

    err.write(f"{nread:5d} total lines\n")
    err.write(f"{tot_para:5d} parags found\n")
    err.write(f"{tot_sepa:6d} total separator chars\n")
    err.write(f"{tot_unit:5d} total units\n")

    if tot_para == 0:
        # The averages below would divide by zero.
        err.write(f"** no parags found in '{ivt_file}'\n")
        sys.exit(1)

    err.write(f"{tot_unit/tot_para:8.2f} avg units/parag\n")

    write_TeX_parms_file(ivt_name, utype, psizes, tot_para, tot_unit)

    return
    # ----------------------------------------------------------------------


def write_TeX_parms_file(ivt_name, utype, psizes, tot_para, tot_unit):
    # Writes the file "res/{ivt_name}-upp-parms.tex" with
    # parameter definitions for LaTeX.
    #
    # The list {psizes} must have {tot_para} elements adding up to
    # {tot_unit}; this is checked against the values returned by
    # {compute_and_print_stats}. The {utype} is currently not used.
    #
    # Raises {ZeroDivisionError} if the average parag size is zero,
    # since the relative sizes are then undefined.

    sznum, sztot, szmin, szsin, szmax, szsax, szavg, szdev = \
        compute_and_print_stats("parag sizes", psizes)
    assert sznum == tot_para
    assert sztot == tot_unit

    szmax_rel = szmax/szavg
    err.write(f"{szmax_rel:6.2f} max relative parag size\n")

    szmin_rel = szmin/szavg
    err.write(f"{szmin_rel:6.2f} min relative parag size\n")

    txpref = name_for_tex_macro(ivt_name)

    tex_lines = [
        f"\\def\\{txpref}NumParags{{{tot_para}}}\n",
        f"\\def\\{txpref}TotUnits{{{tot_unit}}}\n",
        "\n",
        f"\\def\\{txpref}PerParagMinUnits{{{szmin}}}\n",
        f"\\def\\{txpref}PerParagMaxUnits{{{szmax}}}\n",
        f"\\def\\{txpref}PerParagAvgUnits{{{szavg:.2f}}}\n",
        f"\\def\\{txpref}PerParagDevUnits{{{szdev:.2f}}}\n",
        "\n",
        f"\\def\\{txpref}PerParagSecMinUnits{{{szsin}}}\n",
        f"\\def\\{txpref}PerParagSecMaxUnits{{{szsax}}}\n",
        "\n",
        f"\\def\\{txpref}PerParagMinRelSize{{{szmin_rel:.2f}}}\n",
        f"\\def\\{txpref}PerParagMaxRelSize{{{szmax_rel:.2f}}}\n",
    ]

    tex_file = f"res/{ivt_name}-upp-parms.tex"
    with open(tex_file, "w", encoding="utf-8") as tex_wr:
        tex_wr.writelines(tex_lines)

    return
    # ----------------------------------------------------------------------


def test_stuff():
    # Placeholder for self-tests; currently just reports that there
    # are none, through {arg_error}.
    arg_error("no tests yet\n")
    return
# ----------------------------------------------------------------------

# Command line: either the single argument "test", or {ivt_name} {utype}.
# Extra arguments after {utype} are ignored, as before.
if len(sys.argv) >= 2 and sys.argv[1] == "test":
    test_stuff()
elif len(sys.argv) >= 3:
    ivt_name = sys.argv[1]
    utype = sys.argv[2]
    main(ivt_name, utype)
else:
    # Missing arguments used to fail with a bare {IndexError}.
    arg_error(f"usage: {sys.argv[0]} {{ivt_name}} {{utype}}  OR  {sys.argv[0]} test\n")
    # NOTE(review): {arg_error} presumably terminates the program;
    # exit explicitly in case it returns.
    sys.exit(1)