#! /usr/bin/python3 # Last edited on 2026-02-26 07:38:50 by stolfi # Takes as arguments a {book} ("bencao" or "starps"), a subset {bsub} # (e.g. "fu" or "gd"), and a unit type {unit} ("ch", "wc", etc.). # # Reads a transcription file "res/{book}-{bsub}-{enc}.ivp" in generic # IVTFF-like format, with one paragraph per line. Writes a file with the # size of each paragraph. # # Ignores spaces, punctuation, symbols, etc. Ignores #-comments and # blank lines. # # Each non-ignored input line must have the format "<{loc}> {text}" # where {loc} is a locus ID like "f105v.20" or "b2.1.050", # and {text} is a line of the transcribed text. # # Each output line has the format "{loc} {psize} {npunc}" where # {psize} is the number of units of a specified type (hanzi characters, # EVA characters, EVA words, pinyin syllables, ...) in the paragraph, # and {npunc} is the number of punctuation characters in it. # # The program takes two command line arguments, the text encoding {enc} # of the input file, and the {unit} for {psize}. The {unit} may be: # # "ch" the {enc} must be {chu} and the input must be hanzi # characters in Unicode, like # " 玉泉:味甘平。主治五脏百病。" # The size is the number of characters, excluding ideographic # punctuation. # # "ps" the {enc} must be "pys" and the input text must be in # pinyin, with isolated syllables, like # " yù quán : wèi gān píng. zhǔ zhì wǔ zàng bǎi bìng." # The size of a parag is the number of syllables. # # "ec" the {enc} must be "eva" and the input must be Voynichese # in EVA, like # "", line): return tot_para += 1 m = re.match(pat_line, line) if m is None: # Invalid line format. 
data_error(f"invalid line format for {pat_line = !r}") # Parse the line into locus ID and text: assert m.lastindex == 2, f"bug {m.lastindex = }" loc = m.group(1) text = m.group(2) units, ct_sepa = spf.split_text_into_units(text, enc, unit, pat_unit, pat_junk, pat_sepa, data_error) ct_unit = len(units) tot_unit += ct_unit tot_sepa += ct_sepa psize_raw = ct_unit psizes.append(psize_raw) if debug: err.write(f"!~ {loc:<12s} psize = {psize_raw}\n") upp_wr.write(f"{loc:<12s} {psize_raw:5d} {ct_sepa:5d}\n") return # ...................................................................... err.write(f"reading file '{ivp_file}' ...\n") nread = basic_line_loop(ivp_rd, process_input_line) ivp_rd.close() upp_wr.close() err.write(f"{nread:5d} total lines\n") err.write(f"{tot_para:5d} parags found\n") err.write(f"{tot_sepa:6d} total separator chars\n") err.write(f"{tot_unit:5d} total units\n") err.write(f"{tot_unit/tot_para:8.2f} avg units/parag\n") write_TeX_parms_file(book, bsub, unit, psizes, tot_para, tot_unit) return # ---------------------------------------------------------------------- def write_TeX_parms_file(book, bsub, unit, psizes, tot_para, tot_unit): # Writes the file "res/{book}-{bsub}-{unit}-upp-parms.tex" with # parameter defintions for LaTeX. 
def write_TeX_parms_file(book, bsub, unit, psizes, tot_para, tot_unit):
    # Writes the file "res/{book}-{bsub}-{unit}-upp-parms.tex" with
    # parameter definitions for LaTeX.
    #
    # The list {psizes} is handed to {compute_and_print_stats}, which
    # returns eight values.  The first two (count and total) must agree
    # with {tot_para} and {tot_unit}; this is checked with assertions.
    # The remaining values are written out as "\def" macros whose names
    # start with {name_for_tex_macro("{book}-{bsub}-{unit}")}.
    #
    # Also writes the max and min paragraph sizes relative to the
    # average to {err}.  Returns nothing.

    # NOTE(review): {szsin} and {szsax} are presumably the second
    # smallest and second largest sizes, judging by the macro names
    # "PerParagSecMinUnits"/"PerParagSecMaxUnits" -- confirm against
    # {compute_and_print_stats}.
    sznum, sztot, szmin, szsin, szmax, szsax, szavg, szdev = \
        compute_and_print_stats("parag sizes", psizes)
    assert sznum == tot_para
    assert sztot == tot_unit

    # Sizes relative to the average.  These divisions fail if {szavg}
    # is zero; no file is written in that case.
    szmax_rel = szmax/szavg
    err.write(f"{szmax_rel:6.2f} max relative parag size\n")
    szmin_rel = szmin/szavg
    err.write(f"{szmin_rel:6.2f} min relative parag size\n")

    pref = f"{book}-{bsub}-{unit}"
    txpref = name_for_tex_macro(pref)

    tex_file = f"res/{pref}-upp-parms.tex"
    # The {with} statement closes the file even if a write or a format
    # conversion raises, so the handle is never leaked.
    with open(tex_file, "w") as tex_wr:
        tex_wr.write(f"\\def\\{txpref}NumParags{{{tot_para}}}\n")
        tex_wr.write(f"\\def\\{txpref}TotUnits{{{tot_unit}}}\n")
        tex_wr.write("\n")
        tex_wr.write(f"\\def\\{txpref}PerParagMinUnits{{{szmin}}}\n")
        tex_wr.write(f"\\def\\{txpref}PerParagMaxUnits{{{szmax}}}\n")
        tex_wr.write(f"\\def\\{txpref}PerParagAvgUnits{{{szavg:.2f}}}\n")
        tex_wr.write(f"\\def\\{txpref}PerParagDevUnits{{{szdev:.2f}}}\n")
        tex_wr.write("\n")
        tex_wr.write(f"\\def\\{txpref}PerParagSecMinUnits{{{szsin}}}\n")
        tex_wr.write(f"\\def\\{txpref}PerParagSecMaxUnits{{{szsax}}}\n")
        tex_wr.write("\n")
        tex_wr.write(f"\\def\\{txpref}PerParagMinRelSize{{{szmin_rel:.2f}}}\n")
        tex_wr.write(f"\\def\\{txpref}PerParagMaxRelSize{{{szmax_rel:.2f}}}\n")
    return
    # ----------------------------------------------------------------------

def test_stuff():
    # Placeholder for self-tests: currently it only calls {arg_error}
    # with the message "no tests yet".
    arg_error("no tests yet\n")
    return
    # ----------------------------------------------------------------------

# Script entry point.  If the first command line argument is "test",
# runs {test_stuff}; otherwise the first three arguments are taken as
# {book}, {bsub}, and {unit} and passed to {main}.  The guard keeps
# this from running when the file is imported rather than executed.
if __name__ == "__main__":
    if sys.argv[1] == "test":
        test_stuff()
    else:
        book = sys.argv[1]
        bsub = sys.argv[2]
        unit = sys.argv[3]
        main(book, bsub, unit)