#! /usr/bin/python3
# Last edited on 2026-02-22 18:19:15 by stolfi

# The command line arguments are a {book} ("bencao" or "starps"), the
# tag {bsub} of a subset of the same ("fu" for full, "gd" for good,
# etc), a text unit specifier ("ch", "ps", "ec", "wp", "wc", etc), and a
# positive integer {tsize}.
#
# Reads a file "res/{book}-{bsub}-{enc}.ivp" where {enc} is determined
# by the {unit} ("chu" for unit "ch", "pys" for "ps", and "eva" for
# "ec", "wc", and "wp"). The file must be in pseudo-IVTFF format, where
# each line is "<{LOC}> {TEXT}".
#
# Extracts and writes all tuples of {tsize} consecutive units of the
# specified type. Writes them to a file
# "res/{book}-{bsub}-{unit}-{tsize}.tup" where each line has the format
# "{LOC} {PSIZE} {TPOS} {TUPLE}" where {TUPLE} is a tuple of {tsize}
# units, {PSIZE} is the total number of units in the parag {LOC}, and
# {TPOS} is the number of units in the parag before that tuple.
#
# NOTE(review): an earlier version of this comment also listed a
# {TSIZE} field between {TPOS} and {TUPLE}, but {output_tuple} does not
# write it. The comment was changed to match the code; confirm which
# one the consumers of the ".tup" files expect.
#
# Before the tuples are extracted, the {TEXT} is cleaned
# as per {spf.split_text_into_units}.

import sys, os, re
from sys import stderr as err
from process_funcs import bash, basic_line_loop
from error_funcs import arg_error, file_line_error, prog_error
from chinese_funcs import read_chinese_char_set
from note_077_funcs import compute_and_print_stats
import size_position_funcs as spf
from note_077_funcs import enc_from_unit

# Matches a line that has only a "<...>" locator, optionally followed by
# "<!...>" inline comments, and no text at all.
#
# NOTE(review): the original test was {re.match(r"", line)}, which matches
# EVERY line (so no data line was ever processed); the intended pattern
# was evidently lost. This replacement assumes that an IVTFF page header
# is a locator with no text after it -- confirm against the real inputs.
PAT_PAGE_HEADER = re.compile(r"\s*<[^<>]*>\s*(<![^<>]*>\s*)*$")


def main(book, bsub, unit, tsize):
    # Reads "res/{book}-{bsub}-{enc}.ivp", with {enc} obtained from
    # {enc_from_unit(unit)}, and writes every tuple of {tsize}
    # consecutive units of each parag to
    # "res/{book}-{bsub}-{unit}-{tsize}.tup".
    #
    # The {tsize} must be in {1..20}; otherwise calls {arg_error}.
    # Prints some statistics to {stderr} at the end.

    if tsize < 1 or tsize > 20:
        arg_error(f"invalid {tsize}")

    enc = enc_from_unit(unit)
    ivp_file = f"res/{book}-{bsub}-{enc}.ivp"
    tup_file = f"res/{book}-{bsub}-{unit}-{tsize}.tup"

    pat_line, pat_unit, pat_junk, pat_sepa, clean_sepa = \
        spf.get_parsing_patterns(enc, unit)

    tot_para = 0  # Count of data lines.
    tot_sepa = 0  # Count of separators in original texts.
    tot_unit = 0  # Total count of units in input file.
    tot_wtup = 0  # Total count of tuples written.

    def process_input_line(nline, line):
        nonlocal tot_para, tot_unit, tot_sepa, tot_wtup
        #
        # Parses a line {line} assuming it is line {nline} of the file.
        # The {line} is always a string (never {None}), but may be "" if
        # the line is empty.
        #
        # Ignores the line if it is a blank or #-comment, or if it looks
        # like an IVTFF page header (see {PAT_PAGE_HEADER}).
        #
        # Otherwise the line must be a data line, matching {pat_line}.
        # Increments {tot_para} for each data line.
        #
        # Splits the text into units with {spf.split_text_into_units},
        # adds the unit and separator counts to {tot_unit} and
        # {tot_sepa}, then writes to {tup_wr} every tuple of {tsize}
        # consecutive units, adding their count to {tot_wtup}.

        # Should we debug the parag?
        debug = False

        def data_error(msg):
            # Reports {msg} for the current line. Presumably
            # {file_line_error} aborts the program; the {assert} is a
            # backstop in case it returns.
            file_line_error(ivp_file, nline, msg, line)
            assert False
            # ----------------------------------------------------------

        assert line is not None, "The {line} arg must not be {None}"

        # Ignore comments and blank lines:
        if re.match(r" *([#]|$)", line):
            return

        # Just in case, ignore IVTFF page headers:
        if PAT_PAGE_HEADER.match(line):
            return

        tot_para += 1

        m = re.match(pat_line, line)
        if m is None:
            # Invalid line format.
            data_error("invalid line format")

        # Parse the line into locus ID and text:
        assert m.lastindex == 2, f"bug {m.lastindex = }"
        loc = m.group(1)
        text = m.group(2)

        # Cleanup text and get the list of units and count of separators:
        units, ct_sepa = spf.split_text_into_units(
            text, enc, unit, pat_unit, pat_junk, pat_sepa, data_error)
        psize = len(units)
        tot_unit += psize
        tot_sepa += ct_sepa

        # Write the tuples. A parag with {psize} units has
        # {psize - tsize + 1} tuples of {tsize} consecutive units
        # (none if it has fewer than {tsize} units):
        ct_wtup = max(0, psize - tsize + 1)
        for iu in range(ct_wtup):
            wtup = clean_sepa.join(units[iu:iu + tsize])
            output_tuple(tup_wr, loc, psize, iu, wtup)
        tot_wtup += ct_wtup

        if debug:
            err.write(f"!~ {loc:<12s} psize = {psize} wtups = {ct_wtup}\n")
        return
        # ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

    # The {with} ensures that both files are closed even if a line is bad.
    # The input is opened first so that a missing input file does not
    # leave an empty output file behind:
    with open(ivp_file, "r", encoding="utf-8") as ivp_rd, \
            open(tup_file, "w", encoding="utf-8") as tup_wr:
        tup_wr.write("# -*- coding: utf-8 -*-\n")
        err.write(f"reading file '{ivp_file}' ...\n")
        nread = basic_line_loop(ivp_rd, process_input_line)

    # Averages are reported as zero when there are no parags,
    # instead of failing with a division by zero:
    avg_unit = tot_unit / tot_para if tot_para > 0 else 0.0
    avg_wtup = tot_wtup / tot_para if tot_para > 0 else 0.0

    err.write(f"{nread:6d} lines read\n")
    err.write(f"{tot_para:6d} parags found\n")
    err.write(f"{tot_sepa:6d} total separator chars\n")
    err.write(f"{tot_unit:5d} total units\n")
    err.write(f"{avg_unit:8.2f} avg units/parag\n")
    err.write(f"{tot_wtup:5d} total tuples written\n")
    err.write(f"{avg_wtup:8.2f} avg tuples/parag\n")
    return
    # ------------------------------------------------------------------


def output_tuple(tup_wr, loc, psize, iu, wtup):
    # Writes to {tup_wr} one line with the tuple {wtup}, assuming that
    # it comes from the parag {loc}, which has {psize} units, and that
    # it starts at position {iu} in the list of units of that parag.
    #
    # The line has the format "{loc} {psize} {iu} {wtup}", with {loc}
    # left-aligned in 12 columns and the two integers right-aligned
    # in 5 columns each.

    # Sanity checks: the tuple must be a single blank-free field.
    assert wtup != "", "empty tuple"
    assert not re.search(r"[ \000-\037]", wtup), "blanks in tuple"

    tup_wr.write(f"{loc:<12s} {psize:5d} {iu:5d} {wtup}\n")
    return
    # ------------------------------------------------------------------


def test_stuff():
    # Placeholder for self-tests; currently just reports that
    # there are none, through {arg_error}.
    arg_error("no tests yet\n")
    return
    # ------------------------------------------------------------------


if __name__ == "__main__":
    if sys.argv[1] == "test":
        test_stuff()
    else:
        book = sys.argv[1]
        bsub = sys.argv[2]
        unit = sys.argv[3]
        tsize = int(sys.argv[4])
        main(book, bsub, unit, tsize)