#! /usr/bin/python3
# Last edited on 2026-03-07 09:17:04 by stolfi

# The command line arguments are the name {ivt_name} of an input file,
# a text unit type {utype} ("ch", "ps", "ec", "wp", "wc", etc.), a
# keyword pattern {kword}, and a filename-safe version {ktag} of the
# same.
#
# Reads the file "res/{ivt_name}.ivt".  The file should be in an
# IVTFF-like format, with lines of the form "<{LOC}> {TEXT}".  Finds
# all occurrences of a given string or RE pattern {kword} in the
# {TEXT} of each line, assuming that its nature is that described by
# {utype}.
#
# Writes to "res/{ivt_name}-{ktag}.wpo" a line for each such
# occurrence.  Each line has the format "{LOC} {PSIZE} {WPOS}", where
# {LOC} is the input line's locus ID; {PSIZE} is the length of the
# {TEXT} in that line; and {WPOS} is the position of the occurrence of
# the word, namely the length of the part of the cleaned input {TEXT}
# that precedes that occurrence.
#
# The unit type {utype} specifies the nature of the {TEXT} and of the
# units used when measuring line sizes and match positions.  In
# particular, this script assumes that the {TEXT} has been cleaned up
# according to the {utype}.  See {clean_up_raw_text} in
# {size_position_funcs.py} for details.
#
# In any case the input file is assumed to be in Unicode UTF-8
# encoding, and so will be the output file.
#
# If {utype} is "ch" or "ec", the {kword} pattern is searched for in
# the {TEXT} as a substring.  In the "ec" case, {kword} must match
# only the characters '[a-z?]'; in the "ch" case it must match only
# simplified hanzi characters.  In either case it must not contain any
# punctuation or the special patterns '^', '$', and '\b'.  Raw parag
# sizes and positions are measured in EVA or hanzi characters.
#
# If {utype} is "ps", the {kword} pattern must match one or more
# pinyin syllables separated by single blanks.  The pattern will be
# modified so that it matches only one or more whole syllables.  The
# line size and word positions are counted in syllables.
# If {utype} is "wc" or "wp", the {kword} must match EVA characters
# [a-z?.].  It may use '.' or '\b' to force the matching of whole
# units (Voynichese tokens); otherwise the matched string may begin
# with a proper suffix of a unit, and/or may end with a proper prefix
# of a unit.
#
# In any case, the {PSIZE} is the total count of units in the {TEXT},
# and {WPOS} is the count of units of {TEXT} that precede the matched
# substring.  If {kword} matches a proper suffix of a unit, the
# unmatched part counts as half a unit in the raw position,
# irrespective of its length.

import sys, os, re
from sys import stderr as err

from process_funcs import bash, basic_line_loop
from error_funcs import arg_error, file_line_error, prog_error
from chinese_funcs import read_chinese_char_set
from note_077_funcs import compute_and_print_stats, name_for_tex_macro
import size_position_funcs as spf

def main(ivt_name, utype, kword, ktag):
    # Main program.  Arguments:
    #
    #   {ivt_name} Name of input ".ivt" file without extension or folder.
    #   {utype}    Unit for raw parag lengths and word positions:
    #              "ch", "ps", "ec", "wc", "wp".
    #   {kword}    String or RE pattern to search for.
    #   {ktag}     String that identifies {kword} for output file names.

    in_file = f"res/{ivt_name}.ivt"
    out_file = f"res/{ivt_name}-{ktag}.wpo"

    # Unit-parsing patterns appropriate for this {utype}:
    pat_line, pat_unit, pat_sepa, clean_sepa = spf.get_parsing_patterns(utype)

    tot_line = 0  # Count of data lines.
    tot_unit = 0  # Total count of units in input file.
    tot_sepa = 0  # Total chars removed in cleanup.
    tot_wocc = 0  # Total occurrences of {kword} found.

    loc_list = []        # Locus IDs of the input lines, without [<>].
    psize_list = []      # Unit counts of the input lines.
    wpos_list_list = []  # Lists of raw word positions, one list per input line.

    def process_input_line(nline, line):
        # Parses {line} assuming it is line {nline} of the input file.
        # The {line} is always a string (never {None}), but may be ""
        # if the line is empty.
        #
        # Ignores the line if it is blank, a '#'-comment, or an
        # IVTFF-like page header.  Otherwise the line must be a data
        # line matching {pat_line}.  Increments {tot_line}, parses the
        # text into units as specified by {utype} (updating
        # {tot_sepa}, {tot_unit}), computes the list {wpos_list} of
        # raw positions of occurrences of {kword} (updating
        # {tot_wocc}), and appends the line's data to {loc_list},
        # {psize_list}, {wpos_list_list}.
        nonlocal tot_line, tot_unit, tot_sepa, tot_wocc

        debug = False  # Should we print this line's data to stderr?

        def data_error(msg):
            # Reports a format error in the current input line; does not return.
            file_line_error(in_file, nline, msg, line)
            assert False, "{file_line_error} should not return"
        # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

        assert line is not None, "The {line} arg must not be {None}"

        # Ignore comments and blank lines:
        if re.match(r" *([#]|$)", line):
            return

        # Just in case, ignore IVTFF page headers, whose locus has no '.'
        # (e.g. "<f1r>").  NOTE(review): the original tested
        # {re.match(r"", line)}, which matches EVERY string and thus
        # silently discarded all input lines; the pattern below assumes
        # the IVTFF convention that data-line loci contain a '.' --
        # confirm against the transliteration file format.
        if re.match(r" *[<][^.>]*[>]", line):
            return

        tot_line += 1

        m = re.match(pat_line, line)
        if m is None:
            # Invalid line format; {data_error} aborts the run.
            data_error("invalid line format")

        # Parse the line into locus ID and text:
        assert m.lastindex == 2, f"bug {m.lastindex = }"
        loc = m.group(1)
        text = m.group(2)

        # Split text into units:
        units, ct_sepa = spf.split_text_into_units(text, utype, pat_unit, pat_sepa, data_error)
        ct_unit = len(units)
        tot_unit += ct_unit
        tot_sepa += ct_sepa

        # Compute raw line size and raw occurrences of {kword}:
        psize = ct_unit
        wpos_list = spf.list_occurrences(kword, units, clean_sepa, utype, data_error)
        tot_wocc += len(wpos_list)

        # Store for processing at end:
        loc_list.append(loc)
        psize_list.append(psize)
        wpos_list_list.append(wpos_list)

        if debug:
            err.write(f"!~ {loc:<12s} units = {psize} occs = {wpos_list}\n")
        return
    # ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

    err.write(f"reading file '{in_file}' ...\n")
    with open(in_file, "r", encoding="utf-8") as rd:
        nread = basic_line_loop(rd, process_input_line)

    assert len(loc_list) == tot_line
    assert len(wpos_list_list) == tot_line
    # Guard against an input with no data lines at all (the original
    # divided by zero here):
    avg_wocc = tot_wocc/tot_line if tot_line > 0 else 0.0

    err.write(f"{nread:6d} lines read\n")
    err.write(f"{tot_line:6d} data lines found\n")
    err.write(f"{tot_sepa:6d} total separator chars\n")
    err.write(f"{tot_unit:6d} total units\n")
    err.write(f"{tot_wocc:6d} total occurrences of {kword}\n")
    err.write(f"{avg_wocc:8.2f} avg occurrences/line\n")

    err.write(f"writing {out_file} with occurrences of '{kword}' ...\n")
    with open(out_file, "w", encoding="utf-8") as wr:
        wr.write("# -*- coding: utf-8 -*-\n")
        output_data(wr, loc_list, psize_list, wpos_list_list)

    write_TeX_parms_file(ivt_name, utype, ktag, psize_list, wpos_list_list, tot_line, tot_unit, tot_wocc)
    return
# ----------------------------------------------------------------------

def output_data(wr, loc_list, psize_list, wpos_list_list):
    # Writes to {wr} one line "{LOC} {PSIZE} {WPOS}" for each
    # occurrence in {wpos_list_list}, followed by a final blank line.
    # Lines with no occurrences produce no output.
    debug = False
    for loc, psize, wpos_list in zip(loc_list, psize_list, wpos_list_list):
        for wpos in wpos_list:
            if debug:
                err.write(f"!~ {loc:<12s} {psize = :6.2f} {wpos = :6.2f}\n")
            wr.write("%-12s %6.2f %6.2f\n" % (loc, psize, wpos))
    wr.write("\n")
    wr.flush()
    return
# ----------------------------------------------------------------------

def write_TeX_parms_file(ivt_name, utype, ktag, psize_list, wpos_list_list, tot_line, tot_unit, tot_wocc):
    # Writes the file "res/{ivt_name}-{ktag}-wpos-parms.tex" with
    # parameter definitions for LaTeX.

    # List and analyze the COUNT of occurrences per input file line:
    npl_list = [ len(wpos_list) for wpos_list in wpos_list_list ]
    nplnum, npltot, nplmin, nplsin, nplmax, nplsax, nplavg, npldev = \
        compute_and_print_stats("occurrences of the word per line", npl_list)
    assert nplnum == tot_line
    assert npltot == tot_wocc
    err.write("\n")

    # List and analyze all POSITIONS of the occurrences:
    wpo_list = [ wpo for sub in wpos_list_list for wpo in sub ]
    wponum, wpotot, wpomin, wposin, wpomax, wposax, wpoavg, wpodev = \
        compute_and_print_stats("Positions of word per line", wpo_list)
    assert wponum == tot_wocc

    # Count lines with and without the word:
    tot_line_with_word = sum(1 for sub in wpos_list_list if len(sub) > 0)
    tot_line_sans_word = tot_line - tot_line_with_word
    err.write(f"lines with word = {tot_line_with_word} without = {tot_line_sans_word}\n")

    unit_size = spf.hanzi_per_unit(utype)  # Nominal avg num of Chinese chars per unit.
    err.write(f"assumed avg hanzi per unit = {unit_size:8.2f}\n")

    txpref = name_for_tex_macro(f"{ivt_name}-{ktag}-word-pos")
    tex_file = f"res/{ivt_name}-{ktag}-wpos-parms.tex"
    with open(tex_file, "w", encoding="utf-8") as tex_wr:
        # These should match the defs from other TeX parms files:
        tex_wr.write(f"\\def\\{txpref}NumLines{{{tot_line}}}\n")
        tex_wr.write(f"\\def\\{txpref}TotUnits{{{tot_unit}}}\n")
        tex_wr.write(f"\n")
        tex_wr.write(f"\\def\\{txpref}LinesWith{{{tot_line_with_word}}}\n")
        tex_wr.write(f"\\def\\{txpref}LinesSans{{{tot_line_sans_word}}}\n")
        tex_wr.write(f"\n")
        tex_wr.write(f"\\def\\{txpref}HanziPerUnit{{{unit_size:.3f}}}\n")
        tex_wr.write(f"\n")
        tex_wr.write(f"\\def\\{txpref}Min{{{wpomin:.2f}}}\n")
        tex_wr.write(f"\\def\\{txpref}Max{{{wpomax:.2f}}}\n")
        tex_wr.write(f"\\def\\{txpref}Avg{{{wpoavg:.2f}}}\n")
        tex_wr.write(f"\\def\\{txpref}Dev{{{wpodev:.2f}}}\n")
        tex_wr.write(f"\n")
        tex_wr.write(f"\\def\\{txpref}TotCt{{{tot_wocc}}}\n")
        tex_wr.write(f"\n")
        tex_wr.write(f"\\def\\{txpref}PerLineMinCt{{{nplmin}}}\n")
        tex_wr.write(f"\\def\\{txpref}PerLineMaxCt{{{nplmax}}}\n")
        tex_wr.write(f"\\def\\{txpref}PerLineAvgCt{{{nplavg:.2f}}}\n")
        tex_wr.write(f"\\def\\{txpref}PerLineDevCt{{{npldev:.2f}}}\n")
        tex_wr.write(f"\n")
        tex_wr.write(f"\\def\\{txpref}PerLineSecMinCt{{{nplsin}}}\n")
        tex_wr.write(f"\\def\\{txpref}PerLineSecMaxCt{{{nplsax}}}\n")
        tex_wr.write(f"\n")
    return
# ----------------------------------------------------------------------

def test_stuff():
    # Placeholder for unit tests.
    arg_error("no tests yet\n")
    return
# ----------------------------------------------------------------------

if __name__ == "__main__":
    # Guarded so that importing this module does not run the CLI.  The
    # original indexed {sys.argv[1]} unconditionally, crashing with
    # {IndexError} when run with no arguments.
    if len(sys.argv) >= 2 and sys.argv[1] == "test":
        test_stuff()
    else:
        if len(sys.argv) != 5:
            arg_error(f"expected 4 arguments {{ivt_name}} {{utype}} {{kword}} {{ktag}}, got {len(sys.argv)-1}\n")
        ivt_name, utype, kword, ktag = sys.argv[1:5]
        main(ivt_name, utype, kword, ktag)