#! /usr/bin/python3
# Last edited on 2026-02-27 07:08:37 by stolfi

# Functions for measuring size and listing word position in a
# parag. To be included in other python scripts.

import sys, os, re
from sys import stdout as out, stderr as err, stdin as inp

from error_funcs import arg_error, file_line_error, prog_error
from process_funcs import bash, basic_line_loop
from chinese_funcs import read_chinese_char_set
from note_077_funcs import compute_and_print_stats

def get_chars_to_ignore(enc, unit):
    # Returns a set of characters that is to be ignored when
    # measuring the size of a parag or computing the positions of
    # a word or pattern in it.
    #
    # The {enc} must be "chu", "pys", or "eva"; the {unit} must be a
    # unit type valid for that encoding, as in {get_parsing_patterns}.
    ignored_chars = set()
    if enc == "chu":
        if unit != "ch": arg_error(f"invalid combo {enc = !r} {unit = !r}")
        # Read tables of chinese character sets:
        set_dir = "langbank/chin"
        ignored_chars |= read_chinese_char_set(f"{set_dir}/utf8-invalid.tbl")
        ignored_chars |= read_chinese_char_set(f"{set_dir}/utf8-bullets.tbl")
        ignored_chars |= read_chinese_char_set(f"{set_dir}/utf8-symbol.tbl")
        ignored_chars |= read_chinese_char_set(f"{set_dir}/utf8-punct.tbl")
        ignored_chars |= read_chinese_char_set(f"{set_dir}/utf8-blank.tbl")
    elif enc == "pys":
        if unit != "ps": arg_error(f"invalid combo {enc = !r} {unit = !r}")
        ignored_chars |= set("-,.;:")
    elif enc == "eva":
        if unit == "ec":
            ignored_chars |= set("-,.")
        elif unit == "wc":
            # We don't need any ignored chars set.
            pass
        elif unit == "wp":
            # We ignore commas:
            ignored_chars |= set(",")
        else:
            arg_error(f"invalid combo {enc = !r} {unit = !r}")
    else:
        arg_error(f"invalid input encoding '{enc}'")
    return ignored_chars
    # ----------------------------------------------------------------------

def get_parsing_patterns(enc, unit):
    # Returns
    #
    #   {pat_line}  RE pattern matching a data line, where group 1
    #               is the locator (minus '<>') and group 2 is the text.
    #
    #   {pat_unit}  RE pattern matching one unit in the raw text.
    #
    #   {pat_junk}  RE pattern matching any char or substring that should be
    #               deleted before parsing the raw text into units.
    #
    #   {pat_sepa}  RE pattern matching any one char that serves
    #               as unit separator in the raw text.
    #
    #   {clean_sepa}  String that should precede, follow,
    #                 and separate units in clean text.
    #
    # The patterns {pat_junk} and/or {pat_sepa} may be {None}
    # if there are supposed to be no chars in that role.
    #
    # If {enc} is "chu", the {loc} must be "b{sec}.{subsec}.{lseq}" and
    # the {text} must be a string of hanzi (Chinese chars) in Unicode
    # encoding.
    #
    # If {enc} is "eva", the {loc} must be "f[0-9]*[rv][0-9]*" and the
    # text must be a string of EVA chars [?a-z] or EVA punctuation [-,.]
    # with optional "<%>" prefix and "<$>" suffix.
    #
    # The {unit} specifies what constitutes a unit of the text. It can be:
    #
    #   "ch" (for {enc}="chu"): a Chinese character.
    #
    #   "ps" (for {enc}="pys"): a pinyin syllable.
    #
    #   "ec" (for {enc}="eva"): an EVA character in [?a-z].
    #
    #   "wc" (for {enc}="eva"): a word, considering comma like period.
    #
    #   "wp" (for {enc}="eva"): a word, ignoring commas.
    #
    if enc == "chu" or enc == "pys":
        pat_loc = r"<(b[1-3][.][0-9][.][0-9][0-9][0-9])>"
        if enc == "chu":
            if unit != "ch": arg_error(f"invalid combo {enc = !r} {unit = !r}")
            pat_unit = r"."                # Each char is a unit.
            pat_ipu = r"[:。,\[\]]"       # Ideographic punctuation.
            pat_ann = r"[(][^()]*[)]"  # Apocryphal annotations (ideographic parens).
            pat_sepa = None                # There are no unit separators on input.
            clean_sepa = ""                # No separators in cleaned text.
        elif enc == "pys":
            if unit != "ps": arg_error(f"invalid combo {enc = !r} {unit = !r}")
            pinyin_cons = r"b-df-hj-np-tv-z"  # Pinyin consonants.
            pinyin_vows = r"aeiouü" + r"āēīōūǖ" + r"àèìòùǜ" + r"áéíóúǘ" + r"ǎěǐǒǔǚ"
            pat_syl = f"[{pinyin_cons}]*[{pinyin_vows}]+(r|ng|n|)"
            pat_unit = f"{pat_syl}"        # A unit is a pinyin syllable.
            pat_ipu = r"[-,.;:'*]"         # Pinyin punctuation.
            pat_ann = r"[(][^()]*[)]"      # Apocryphal annotations (ascii parens).
            pat_sepa = r"[- .,']"          # Any of these serves as a unit separator on input.
            clean_sepa = ' '               # Single blank separator in cleaned text.
        # Punctuation and annotations are junk for both "chu" and "pys":
        pat_junk = f"{pat_ipu}|{pat_ann}"  # Junk to be deleted.
    elif enc == "eva":
        pat_loc = r"<(f[0-9]+[rv][0-9]*[.][0-9]+)>"
        if unit == "ec":
            pat_unit = r"[a-z?]"   # Unit is a single EVA char.
            pat_junk = r"[-,. ]"   # Any of these should be deleted.
            pat_sepa = None        # There are no unit separators on input.
            clean_sepa = ''        # No unit separator in cleaned text.
        elif unit == "wc":
            pat_unit = r"[a-z?]+"  # Unit is one or more EVA chars.
            pat_junk = r"[ ]"      # Only blanks can be just deleted.
            pat_sepa = r"[-,.]"    # Any of these serves as a unit separator on input.
            clean_sepa = '.'       # Unit separator is '.' in cleaned text.
        elif unit == "wp":
            pat_unit = r"[a-z?]+"  # Unit is one or more EVA chars.
            pat_junk = r"[, ]"     # Commas too can be deleted in input text.
            pat_sepa = r"[-.]"     # Periods and line break separate units on input.
            clean_sepa = '.'       # Unit separator is '.' in cleaned text.
        else:
            arg_error(f"invalid combo {enc = !r} {unit = !r}")
    else:
        # Was a bare {assert False}; report the bad encoding like
        # {get_chars_to_ignore} does:
        arg_error(f"invalid input encoding '{enc}'")
    pat_line = f"{pat_loc}[ ]+(.*)\n"
    return pat_line, pat_unit, pat_junk, pat_sepa, clean_sepa
    # ----------------------------------------------------------------------

def average_word_size(unit):
    # Nominal average number of {unit}s for a Chinese ideogram word.
    #
    # BUG FIX: the original body ended with the placeholder "return ???"
    # (a syntax error) for {unit} "wc".  The nominal factors for the EVA
    # units ("ec", "wc", "wp") were never determined, so those cases now
    # fail loudly instead of failing to compile.
    if unit == "ch" or unit == "ps":
        return 1.0
    else:
        # TODO: determine the nominal factors for the EVA unit types.
        raise NotImplementedError(f"average_word_size not defined for {unit = !r}")
    # ----------------------------------------------------------------------

def split_text_into_units(text, enc, unit, pat_unit, pat_junk, pat_sepa, data_error):
    # Splits the raw {text}, assumed to be of the type specified by {enc},
    # into units of the type specified by {unit}.
    #
    # Returns a list {units} with the units, and a count {ct_sepa} of
    # chars in {text} that were matched by {pat_sepa} (and thus not counted as units).
    #
    # Characters matched by {pat_junk} are deleted and not counted.
    #
    # Currently the {enc} must be "chu" (Chinese characters in Unicode),
    # "pys" (pinyin with separated syllables in Unicode), or "eva"
    # (EVA-encoded Voynichese).
    #
    # The {unit} must be either "ch", "ps", "ec", "wc", or "wp",
    # as described under {get_parsing_patterns}.
    #
    # If {enc} is "eva", also deletes inline comments, alignment and
    # parag markers, and paired ligature braces '{}'; turns weirdo codes
    # into '?'; maps everything to lowercase, all this before splitting
    # into units and counting the punctuation.
    #
    # After that cleanup, the parsing into units uses
    # {pat_unit} and {pat_sepa}.
    #
    # The {data_error} procedure is called with a message string if the
    # text contains chars that match neither {pat_unit} nor {pat_sepa}.

    # Remove junk not counted as punctuation:
    if enc == "chu":
        # Nothing special to remove
        pass
    elif enc == "pys":
        # Nothing special to remove
        pass
    elif enc == "eva":
        # First remove junk that should not be there, just in case:
        text = re.sub(r"[<][!][^<>]*[>]", "", text)          # Inline comments.
        text = re.sub(r"[«=» ]", "", text)                   # Alignment/parag markers, blanks.
        text = re.sub(r"[{]([^{}]*)[}]", r"\1", text)        # Paired ligature braces.
        text = re.sub(r"[&][0-9][0-9][0-9][;]?", "?", text)  # Weirdo codes become '?'.
        text = re.sub(r"^<[%]>", "", text)                   # Parag start marker.
        text = re.sub(r"<[$]>$", "", text)                   # Parag end marker.
        # Map all to lowercase:
        text = text.lower()
    else:
        arg_error(f"invalid encoding {enc = !r}")

    # Remove simple junk chars:
    if pat_junk is not None:
        text = re.sub(pat_junk, '', text)

    # Now loop on units and separators:
    ct_sepa = 0
    units = []
    while len(text) > 0:
        if pat_sepa is not None:
            m = re.match(pat_sepa, text)
            if m is not None:
                # {re.match} is anchored at 0, so {m.end()} is the match length:
                n = m.end()
                assert n > 0, f"pattern {pat_sepa} matched empty"
                ct_sepa += n
                text = text[n:]
                continue
        m = re.match(pat_unit, text)
        if m is not None:
            n = m.end()
            assert n > 0, f"pattern {pat_unit} matched empty"
            units.append(text[0:n])
            text = text[n:]
            continue
        data_error(f"invalid chars = '{text}'")
    return units, ct_sepa
    # ----------------------------------------------------------------------

def list_occurrences(word, units, clean_sepa, unit, data_error):
    # Returns a list of the occurrences of {word} in
    # the text that consists of the list of {units} of type {unit}
    # preceded, separated, and terminated by {clean_sepa}.
    #
    # Validates the assembled clean text for stray characters that the
    # cleanup should have removed, calling {data_error} if any is found,
    # then dispatches to the unit-specific search function.
    if len(units) == 0: return []
    clean_text = clean_sepa + clean_sepa.join(units) + clean_sepa
    if unit == "ch":
        m = re.search(r"[\000-\377]|[ :。,()]", clean_text)
        if m is not None:
            data_error(f"invalid character '{m.group(0)}' in hanzi text")
        oclist = list_ch_occurrences(word, clean_text)
    elif unit == "ps":
        m = re.search(r"[.,:;()*!?]", clean_text)
        if m is not None:
            data_error(f"invalid character '{m.group(0)}' in pinyin text")
        oclist = list_ps_occurrences(word, clean_text)
    elif unit == "ec":
        m = re.search(r"[^a-z?]", clean_text)
        if m is not None:
            data_error(f"invalid character '{m.group(0)}' in word-less EVA text")
        oclist = list_ec_occurrences(word, clean_text)
    elif unit == "wc" or unit == "wp":
        m = re.search(r"[^.a-z?]", clean_text)
        if m is not None:
            data_error(f"invalid character '{m.group(0)}' in word-split EVA text")
        oclist = list_wc_or_wp_occurrences(word, clean_text)
    else:
        arg_error(f"invalid {unit = !r}")
    return oclist
    # ----------------------------------------------------------------------

def list_ch_occurrences(word, clean_text):
    # Finds occurrences of {word} as string in the Unicode CHU-8 hanzi
    # {clean_text}, returning their positions. The text {clean_text} should have no
    # iso-latin-1 chars or hanzi punctuation.
    #
    # Positions are measured in hanzi chars.
    oclist = list(( p.start() for p in re.finditer(word, clean_text) ))
    return oclist
    # ----------------------------------------------------------------------

def list_ps_occurrences(word, clean_text):
    # Search {word} in {clean_text} matching only whole syllables. The
    # {clean_text} text should have single ' ' before, between, and after
    # each syllable.
    #
    # Positions are measured in pinyin syllables.
    debug = False
    assert clean_text[0] == ' ' and clean_text[-1] == ' ', "cleanup failed"
    # We must loop on syllables:
    rest = clean_text
    word = r"\b" + word + r"\b" # Ensure that {word} matches only whole sylls.
    oclist = []
    pref = ""
    while True:
        if debug: err.write(f"!* '{pref}|{rest}'\n")
        m = re.search(word, rest)
        if m is None: break
        ix = m.start(); fx = m.end()
        if debug: err.write(f"!* {ix = } {fx = }\n")
        assert ix < len(rest)
        # If {word} matched just before ' ', adjust {ix} to exclude the ' ':
        if rest[ix] == ' ': ix = ix + 1
        # If {word} matched just after ' ', adjust {fx} to exclude the ' ':
        if fx < len(rest) and rest[fx-1] == ' ': fx = fx - 1
        if debug: err.write(f"!* adjusted {ix = } {fx = }\n")
        # Make sure that match is whole words:
        assert ix > 0 and ix < len(rest) and rest[ix-1] == ' '
        assert fx > ix, f"the pattern {word = !r} matched the empty string"
        assert fx < len(rest) and rest[fx] == ' '
        # Count whole words before occurrence of {word}:
        pref = pref + rest[0:ix]; core = rest[ix:fx]; rest = rest[fx:]
        if debug: err.write(f"!* '{pref}[{core}]{rest}'\n")
        oc = len(re.findall(r"[ ]", pref)) - 1
        if debug: err.write(f"!* {oc = }'\n")
        oclist.append(oc)
        pref = pref + core
    return oclist
    # ----------------------------------------------------------------------

def list_ec_occurrences(word, clean_text):
    # Finds occurrences of {word} as string in the EVA {clean_text}, returning
    # their positions. Assumes that all blanks and EVA punctuation [-,.]
    # have been deleted.
    #
    # Positions are measured in EVA character counts.
    oclist = list(( p.start() for p in re.finditer(word, clean_text) ))
    return oclist
    # ----------------------------------------------------------------------

def list_wc_or_wp_occurrences(word, clean_text):
    # Finds occurrences of {word} in the EVA {clean_text}, returning their
    # positions.
    #
    # Assumes that blanks, other EVA junk, and unwanted separators ([,]
    # for "wp") have been deleted, relevant unit separators ([-,.] for
    # "wc", [-.] for "wp") have been mapped to '.', and there is a single
    # '.' before, between, and after every unit.
    #
    # Positions are measured in EVA words. Unmatched EVA word suffixes or
    # prefixes are counted as 0.5 word.
    assert clean_text[0] == '.' and clean_text[-1] == '.', "cleanup failed"
    debug = False
    # We must loop on words:
    # BUG FIX: the original re-wrapped {clean_text} in '.'s here, even
    # though the assert above guarantees it is already wrapped; the extra
    # dots shifted every reported position by +1.
    rest = clean_text
    oclist = []
    pref = ""
    while True:
        if debug: err.write(f"!* '{pref}|{rest}'\n")
        m = re.search(word, rest)
        if m is None: break
        ix = m.start(); fx = m.end()
        if debug: err.write(f"!* {ix = } {fx = }\n")
        assert ix < len(rest)
        # If {word} matched just before '.', adjust {ix} to exclude the '.':
        if rest[ix] == '.': ix = ix + 1
        # If {word} matched just after '.', adjust {fx} to exclude the '.':
        # BUG FIX: the original tested for ' ' (copy-paste from the pinyin
        # version), which never occurs in '.'-separated EVA text.
        if fx < len(rest) and rest[fx-1] == '.': fx = fx - 1
        if debug: err.write(f"!* adjusted {ix = } {fx = }\n")
        # Count whole words before occurrence of {word}:
        pref = pref + rest[0:ix]; core = rest[ix:fx]; rest = rest[fx:]
        if debug: err.write(f"!* '{pref}[{core}]{rest}'\n")
        oc = len(re.findall(r"[.]", pref)) - 1
        if debug: err.write(f"!* {oc = }'\n")
        # If {word} matched only a suffix, count the prefix as half a word:
        if pref != '' and pref[-1] != '.': oc += 0.5
        oclist.append(oc)
        pref = pref + core
    return oclist
    # ----------------------------------------------------------------------

def test_stuff():
    # Smoke-tests the parsing and occurrence-listing functions on small
    # sample texts in each supported encoding, writing results to {err}.

    def data_error(msg):
        err.write(f"** {msg}\n"); assert False
        # ....................................................................

    for enc, unit in ( ("eva","wc",), ("eva","wp",), ("eva","ec",), ("chu","ch",), ("pys", "ps",) ):
        err.write(f"!* counting '{unit}' units in {enc} text\n")
        ignored_chars = get_chars_to_ignore(enc, unit)
        err.write(f"!* {ignored_chars = !r}\n")
        pat_line, pat_unit, pat_junk, pat_sepa, clean_sepa = \
            get_parsing_patterns(enc, unit)
        if enc == "eva":
            text = "foo.bar.,baz.-qofoo,barrifoo"
            words = [ "foo", "bar", ]
            if unit == "wc" or unit == "wp":
                words += [ "[.]foo", "foo[.]", "[.]foo[.]", "foo[.]bar", ]
                clean_sepa = '.'
            elif unit == "ec":
                clean_sepa = ''
        elif enc == "pys":
            text = " tóng yè wèi. zhǔ zhì è zhe zhǔ zhì yīn. pí, zhǔ wǔ zhì, shāng ǔ. ǔ."
            words = [ "zhǔ", "zhǔ zhì" ]
            assert unit == "ps"
            clean_sepa = ' '
        elif enc == "chu":
            text = "白石英:味甘微温。主治消渴,阴痿不足,欬逆白石,青石黑石脂等:味甘平。主治黄疸,泄利"
            words = [ "白石", "主治" ]
            assert unit == "ch"
            clean_sepa = ''
        err.write(f"!* raw text = '{text}')\n")
        units, ct_sepa = \
            split_text_into_units(text, enc, unit, pat_unit, pat_junk, pat_sepa, data_error)
        err.write(f"!* {ct_sepa = } {units = !r}\n")
        err.write("\n")
        for word in words:
            err.write(f"!* looking up '{word}'\n")
            oclist = list_occurrences(word, units, clean_sepa, unit, data_error)
            err.write(f"!* found = {oclist}\n")
        err.write("\n")
    return
    # ----------------------------------------------------------------------

# BUG FIX: the original tested {len(sys.argv) > 0}, which is always true,
# so running the module with no arguments crashed on {sys.argv[1]}.
if len(sys.argv) > 1 and sys.argv[1] == "test":
    test_stuff()