#! /usr/bin/python3
# Last edited on 2026-03-06 08:08:22 by stolfi
#
# The command line arguments are the name {ivt_name} of an input file,
# a text unit type {utype} ("ch", "ps", "ec", "wp", "wc", etc), and a
# positive integer {tsize}. Optionally there may also be a keyword pattern
# {kword} (which may be "NONE") and a filename-safe version {ktag} of
# the same.
#
# Reads a file "res/{ivt_name}.ivt". The file must be in pseudo-IVTFF
# format, where each line is "<{LOC}> {TEXT}", where {LOC} is a locus ID
# like "f105v.20" or "b2.1.050", and {TEXT} is a line of the transcribed
# text.
#
# The unit type {utype} specifies the nature of the {TEXT} and of the
# units used when measuring line sizes. In particular, this script
# assumes that the {TEXT} has been cleaned up according to the {utype}. See
# {clean_up_raw_text} in {size_position_funcs.py} for details.
#
# In any case the input file is assumed to be in Unicode UTF-8 encoding,
# and so will be the output file.
#
# This script parses the {TEXT} into units of the specified {utype}
# with {spf.split_text_into_units}. Then it extracts and writes all strings
# that consist of {tsize} consecutive units. If the {kword} is not
# "NONE", considers only tuples that begin with a unit that matches the
# pattern {kword}.
#
# The {kword} pattern may include '^' and/or '$' and/or '\b' to specify
# that it must match a prefix, suffix, or whole unit. This option
# is obviously not available (or meaningful) if {utype} is "ch" or "ec".
#
# In any case the {kword} will be tested only against the leading unit.
# It will never match two or more units.
#
# Writes the extracted tuples to a file
# "res/{ivt_name}-{tsize}-with-{ktag}.tup". If {kword} is "NONE" omits
# the "-with-{ktag}" part.
#
# Each extracted tuple is written as a separate line with the format
# "{LOC} {PSIZE} {TPOS} {TSIZE} {TUPLE}" where {LOC} is the locus ID of
# the input file line, {PSIZE} is the total number of units in that
# line, {TSIZE} is the given {tsize}, {TUPLE} is the extracted tuple,
# and {TPOS} is the number of units in the input {TEXT} before that
# tuple.
#
# NOTE(review): {output_tuple} below actually writes only four fields,
# "{LOC} {PSIZE} {TPOS} {TUPLE}" (no {TSIZE}). Confirm which of the
# two is intended before changing either.
#
# The {TUPLE} field of the output is the concatenation of the selected
# {tsize} units, with the {clean_sepa} separator defined by
# {spf.get_parsing_patterns(utype)}.

import sys, os, re
from sys import stderr as err
from process_funcs import bash, basic_line_loop
from error_funcs import arg_error, file_line_error, prog_error
from chinese_funcs import read_chinese_char_set
from note_077_funcs import compute_and_print_stats
import size_position_funcs as spf


def find_tuples(units, tsize, kword_pat, clean_sepa):
    # Returns a list of pairs {(iu, wtup)}, one for each run of {tsize}
    # consecutive elements of the list {units}. Here {iu} is the index of
    # the first unit of the run and {wtup} is the run's units joined with
    # {clean_sepa}.
    #
    # If {kword_pat} is not {None} it must be a compiled regex; then only
    # the runs whose FIRST unit satisfies {kword_pat.search} are returned.
    #
    # A list of {n} units has {n - tsize + 1} such runs (none if
    # {n < tsize}); the last one starts at index {n - tsize}.
    nwin = len(units) - tsize + 1
    return [
        (iu, clean_sepa.join(units[iu:iu + tsize]))
        for iu in range(nwin)
        if kword_pat is None or kword_pat.search(units[iu])
    ]
    # ----------------------------------------------------------------------


def main(ivt_name, utype, tsize, kword, ktag):
    # Reads "res/{ivt_name}.ivt", splits each data line into units of
    # type {utype}, and writes every tuple of {tsize} consecutive units
    # to "res/{ivt_name}-{tsize}.tup", or to
    # "res/{ivt_name}-{tsize}-with-{ktag}.tup" if {kword} is not {None}.
    #
    # The {tsize} must be in {1..20}. The {kword} is either {None} or a
    # regex pattern that is tested with {re.search} against the first
    # unit of each tuple. The {ktag} must be given if and only if
    # {kword} is given.
    #
    # Writes counts and statistics to {stderr}.

    if tsize < 1 or tsize > 20:
        arg_error (f"invalid {tsize}")

    src_file = f"res/{ivt_name}.ivt"

    out_name = f"{ivt_name}-{tsize}"
    if kword is not None:
        if ktag is None:
            arg_error(f"the {ktag = !r} argument cannot be omitted")
        out_name += f"-with-{ktag}"
    else:
        if ktag is not None:
            arg_error(f"spurious {ktag = !r} argument")
    tup_file = f"res/{out_name}.tup"

    # Compile the keyword pattern once instead of once per unit:
    kword_pat = re.compile(kword) if kword is not None else None

    pat_line, pat_unit, pat_sepa, clean_sepa = spf.get_parsing_patterns(utype)

    tot_data = 0  # Count of data lines.
    tot_sepa = 0  # Count of separators in original texts.
    tot_unit = 0  # Total count of units in input file.
    tot_wtup = 0  # Total number of tuples extracted.

    psizes = []   # Number of units in each input line.
    ntupses = []  # Number of tuples in each input line.

    def process_input_line(nline, line):
        nonlocal tot_data, tot_unit, tot_sepa, tot_wtup
        #
        # Parses a line {line} assuming it is line {nline} of the file.
        # The {line} is always a string (never {None}), but may be "" if
        # the line is empty.
        #
        # Ignores the line if it is blank, a #-comment, or an IVTFF
        # page header.
        #
        # Otherwise the line must be a data line, matching {pat_line}.
        #
        # Increments {tot_data} for each data line, splits its text into
        # units of type {utype}, and writes its tuples to {tup_wr}
        # (which is bound by the {with} statement further down, before
        # this function is first called).
        #
        # Updates {tot_sepa,tot_unit,tot_wtup} and appends the line's
        # unit and tuple counts to {psizes,ntupses}.

        # Should we debug the input line?
        debug = False

        def data_error(msg):
            file_line_error(src_file, nline, msg, line)
            assert False
            # ------------------------------------------------------------------

        assert line is not None, "The {line} arg must not be {None}"
        line = line.rstrip()

        # Ignore comments and blank lines:
        if re.match(r" *([#]|$)", line): return

        # Just in case, ignore IVTFF page headers. The pattern in the
        # original source was empty (so it matched EVERY line and no
        # data was ever processed). This reconstruction assumes that a
        # page header is a leading "<...>" locator without any period,
        # whereas data loci like "f105v.20" always have one.
        # NOTE(review): confirm against the actual input files.
        if re.match(r"<[^<>.]+>", line): return

        tot_data += 1

        m = re.match(pat_line, line)
        if m is None:
            # Invalid line format.
            data_error("invalid line format")

        # Parse the line into locus ID and text:
        assert m.lastindex == 2, f"bug {m.lastindex = }"
        loc = m.group(1)
        text = m.group(2)

        # Split the text into units and get the count of separators:
        units, ct_sepa = \
            spf.split_text_into_units(text, utype, pat_unit, pat_sepa, data_error)
        psize = len(units)
        tot_unit += psize
        tot_sepa += ct_sepa

        # Extract and write the tuples of this line:
        found = find_tuples(units, tsize, kword_pat, clean_sepa)
        for iu, wtup in found:
            output_tuple(tup_wr, loc, psize, iu, wtup)
        ntups = len(found)
        if debug:
            err.write(f"!~ {loc:<12s} units = {psize} tuples = {ntups}\n")
        tot_wtup += ntups
        psizes.append(psize)
        ntupses.append(ntups)
        return
        # ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

    # The {with} ensures that both files get closed even on errors.
    # The source is opened first, so that a missing input file does
    # not truncate an existing output file.
    with open(src_file, "r", encoding="utf-8") as src_rd, \
         open(tup_file, "w", encoding="utf-8") as tup_wr:
        tup_wr.write("# -*- coding: utf-8 -*-\n")
        err.write(f"reading file '{src_file}' ...\n")
        nread = basic_line_loop(src_rd, process_input_line)

    err.write(f"{nread:6d} lines read\n")
    err.write(f"{tot_data:6d} data lines found\n")
    err.write(f"{tot_sepa:6d} total separator chars\n")
    err.write(f"{tot_unit:5d} total units\n")
    if tot_data == 0:
        # Averages and per-line statistics are undefined without data.
        err.write("no data lines, skipping averages and statistics\n")
        return
    err.write(f"{tot_unit/tot_data:8.2f} avg units per input line\n")
    err.write(f"{tot_wtup:5d} total tuples written\n")
    err.write(f"{tot_wtup/tot_data:8.2f} avg tuples per input line\n")

    # Only the count and total of each statistic are cross-checked here:
    vnum, vtot, *_ = compute_and_print_stats("units per data linee", psizes)
    assert vnum == tot_data
    assert vtot == tot_unit

    vnum, vtot, *_ = compute_and_print_stats("tuples per data linee", ntupses)
    assert vnum == tot_data
    assert vtot == tot_wtup

    return
    # ----------------------------------------------------------------------


def output_tuple(tup_wr, loc, psize, iu, wtup):
    # Writes to {tup_wr} one line with the tuple {wtup}, assuming that it
    # comes from the input line with locus {loc}, which has {psize} units,
    # and that it starts at position {iu} in the list of units.
    #
    # The line has the format "{loc} {psize} {iu} {wtup}" with {loc}
    # left-aligned in 12 columns and the two counts in 5 columns each.

    # Sanity checks:
    assert wtup != "", "empty tuple"
    assert not re.search(r"[ \000-\037]", wtup), "blanks in tuple"

    tup_wr.write(f"{loc:<12s} {psize:5d} {iu:5d} {wtup}\n")
    return
    # ----------------------------------------------------------------------


def test_stuff():
    arg_error("no tests yet\n")
    return
    # ----------------------------------------------------------------------


def none_if_blank(arg):
    # Returns {None} if the command line argument {arg} is one of the
    # conventional "no value" spellings, otherwise returns {arg} itself.
    return None if arg in ("NONE", "None", "") else arg
    # ----------------------------------------------------------------------


def run_from_command_line(argv):
    # Parses the command line {argv} (in the format of {sys.argv}) and
    # runs either {test_stuff} or {main}.
    #
    # The arguments are "test", or
    # "{ivt_name} {utype} {tsize} [ {kword} [ {ktag} ] ]". A {kword} or
    # {ktag} that is omitted, empty, "NONE", or "None" is passed to {main}
    # as {None}; {main} then checks that the two are consistent.
    narg = len(argv)

    if narg >= 2 and argv[1] == "test":
        test_stuff()
        return

    if narg < 4:
        arg_error("expected arguments {ivt_name} {utype} {tsize} [ {kword} {ktag} ]")
        sys.exit(1)  # In case {arg_error} returns.

    iarg = 1
    ivt_name = argv[iarg]; iarg += 1
    utype = argv[iarg]; iarg += 1
    try:
        tsize = int(argv[iarg])
    except ValueError:
        arg_error(f"the tuple size {argv[iarg]!r} is not an integer")
        sys.exit(1)  # In case {arg_error} returns.
    iarg += 1

    kword = None
    ktag = None
    if iarg < narg:
        kword = none_if_blank(argv[iarg]); iarg += 1
    if iarg < narg:
        ktag = none_if_blank(argv[iarg]); iarg += 1
    if iarg != narg:
        arg_error(f"spurious arguments {argv[iarg:]!r}")
        sys.exit(1)  # In case {arg_error} returns.

    main(ivt_name, utype, tsize, kword, ktag)
    return
    # ----------------------------------------------------------------------


if __name__ == "__main__":
    run_from_command_line(sys.argv)