#! /usr/bin/python3
# Last edited on 2026-03-29 20:47:27 by stolfi

# Functions for measuring size and listing word position in a
# parag. To be included in other python scripts.

import sys, os, re
from sys import stdout as out, stderr as err, stdin as inp
from error_funcs import arg_error, file_line_error, prog_error
from process_funcs import bash, basic_line_loop
from chinese_funcs import read_chinese_char_set
from note_077_funcs import compute_and_print_stats
from math import sqrt, hypot, exp, log, pi, inf, nan, floor, ceil

def clean_up_raw_text(text, utype, data_error):
    # Cleans up the raw {text} from an SBJ or SPS transcription file according to {utype}
    # for use in Note 077 analysis.
    #
    # See {clean_up_bencao_raw_text} for the meaning of {utype}s "ch" and "ps".
    # See {clean_up_starps_raw_text} for the meaning of {utype}s "ec", "wp", and "wc".
    #
    if utype == "ch" or utype == "ps":
        text = clean_up_bencao_raw_text(text, utype, data_error)
    elif utype == "ec" or utype == "wc" or utype == "wp":
        # The parag head/tail flags are not needed by callers of this wrapper:
        text, _head, _tail = clean_up_starps_raw_text(text, utype, data_error)
    return text
    # ----------------------------------------------------------------------

def normalize_raw_text(text, utype, data_error):
    # Modifies a raw {text} from an SBJ or SPS transcription
    # file according to {utype} for use in Note 077 analysis, by replacing
    # certain substrings that are assumed to be scribal errors,
    # abbreviations, or non-significant variants.
    #
    # See {normalize_bencao_raw_text} for the meaning of {utype}s "ch" and "ps".
    # See {normalize_starps_raw_text} for the meaning of {utype}s "ec", "wp", and "wc".
    #
    if utype == "ch" or utype == "ps":
        text = normalize_bencao_raw_text(text, utype, data_error)
    elif utype == "ec" or utype == "wc" or utype == "wp":
        text = normalize_starps_raw_text(text, utype, data_error)
    return text
    # ----------------------------------------------------------------------

def clean_up_starps_raw_text(text, utype, data_error):
    # Cleans up the raw {text} from an SPS transcription file according to {utype}
    # for use in Note 077 analysis.
    #
    # The function returns the cleaned {text} and booleans {head} and
    # {tail} that specify whether the line was a parag head line or parag
    # tail line, respectively. Note that both may be true for a one-line
    # parag.
    #
    # INPUT TEXT EXPECTED
    #
    # The raw {text} given must be a non-empty string that may contain
    # any of these:
    #
    #   Prefix "<%>" marking the line as the head of a paragraph.
    #
    #   Prefix [«=»] to indicate the alignment of the start of the line
    #   relative to the left rail.
    #
    #   EVA letters [A_Za-z].
    #
    #   Invalid EVA code '?'.
    #
    #   Weirdo codes "&{NNN};" (possibly without the ';')
    #   where {NNN} is three decimal digits.
    #
    #   Ligatures consisting of two or more EVA letters, '?', or
    #   weirdo codes enclosed in braces "{...}".
    #
    #   Word separators [-.,].
    #
    #   Suffix [«=»] to indicate the alignment of the end of the line
    #   relative to the right rail.
    #
    #   Suffix "<$>" to mark the line as the tail of a parag.
    #
    #   Inline comments "<!...>", including various special comments
    #   to indicate stars in the margin, wide linegaps, figure intrusions,
    #   vellum folds, etc. These may appear anywhere in the line
    #   including before or after all the {text} and inside other markup
    #   tags.
    #
    # Example:
    #
    #   "={Ch}eeo,daiin.{Sh}eedy&162&211am.{Ch}odal.ol{Ch}edy«<$>"
    #   "<%>=w{Ch}edairs.oeail.{Ch}otaropdaiin.otol,dair.aiir,aim="
    #   "={Ch}ol.oeedy.keedy.{Ch}{Ch}eky{air.ar.{Sh}ol.{Ch}edy.ot{Ch}edy.{Qo}ty="
    #
    # Note the parag break between the first and second text.
    #
    # CLEANUP PERFORMED
    #
    # For all {utype}s, the cleanup entails:
    #
    #   Removing all inline comments.
    #
    #   Removing all markup, including the braces [{}], parag markers "<%>" and "<$>",
    #   and rail alignment markers [«=»].
    #
    #   Mapping 'w' and 'z', used in my transcription to encode hooked puffs,
    #   to 'p' and 'f', respectively.
    #
    #   Mapping the uncertain reading characters [bj] to '?'.
    #
    #   Mapping all weirdo codes to '?'.
    #
    #   Mapping everything to lowercase.
    #
    # Moreover,
    #
    #   If {utype} is "ec", all punctuation [-,.] is deleted.
    #
    #   If {utype} is "wp", every ',' is deleted, and '-' is mapped to '.'
    #
    #   If {utype} is "wc", every ',' and '-' are mapped to '.'
    #
    # CLEANED TEXT
    #
    # After this cleanup, if {utype} is "ec", the text must have only
    # characters [ac-fhik-tvxy?]. Example:
    #
    #   "choloeedykeedychchekycheod?eyke??edeedydaiiinaldair"
    #
    # If {utype} is "wc" or "wp", the raw text may also have '.' separators. Example:
    #
    #   "chol.oeedy.keedy.chcheky.cheod?ey.ke??edeedy.daiiin.ald.air"
    #
    # For more restrictions, see {check_starps_cleanup} below.
    #
    text = text.strip()
    if utype == "ec" or utype == "wc" or utype == "wp":
        # Remove inline comments:
        text = re.sub(r"[<][!][^<>]*[>]", "", text)
        # Map weirdo codes to '?':
        text = re.sub(r"[&][0-9][0-9][0-9][;]?", "?", text)
        # Remove ligature marks '{}':
        text = re.sub(r"[{]([^{}]*)[}]", r"\1", text)
        # Map to lowercase:
        text = text.lower()
        # Map hooked puffs to normal ones:
        text = re.sub(r"w", "p", text)
        text = re.sub(r"z", "f", text)
        # Map rare uncertain characters to '?':
        text = re.sub(r"[bj]", "?", text)
        # Check for parag markers:
        head = ( re.search(r"^[<][%][>]", text) is not None )
        tail = ( re.search(r"[<][$][>]$", text) is not None )
        # Remove all parag markers for now:
        text = re.sub(r"[<][%$][>]", "", text)
        # Remove alignment markers:
        text = re.sub(r"^[«=»]", "", text)
        text = re.sub(r"[«=»]$", "", text)
        # Check for invalid characters in input:
        m_bad = re.search(r"[^-.,ac-ik-z?]", text)
        if m_bad is not None:
            ibeg = m_bad.start()
            data_error(f"invalid character '{text[ibeg]}' at {ibeg}")
        # Check for irregular spaces in input (leading, trailing, or doubled punctuation):
        m_bad = re.search(r"^[-,.]|[-,.][-,.]|[-,.]$", text)
        if m_bad is not None:
            ibeg = m_bad.start(); iend = m_bad.end()
            data_error(f"improper punctuation '{text[ibeg:iend]}' at {ibeg}")
        # Replace punctuation per {utype}:
        text = re.sub(r"-", ".", text)
        if utype == "ec":
            text = re.sub(r"[,.]", "", text)
        elif utype == "wc":
            text = re.sub(r"[,]", ".", text)
        elif utype == "wp":
            text = re.sub(r"[,]", "", text)
        else:
            assert False # Should not happen.
    else:
        arg_error(f"invalid {utype = !r}")
    check_starps_cleanup(text, utype, data_error)
    return text, head, tail
    # ----------------------------------------------------------------------

def clean_up_bencao_raw_text(text, utype, data_error):
    # Cleans up the raw {text} from an SBJ transcription file according to {utype}
    # for use in Note 077 analysis. Returns the cleaned {text}.
    #
    # INPUT TEXT EXPECTED
    #
    # For any {utype} the input text may contain editorial annotations and markup.
    # The details depend on {utype}:
    #
    #   "ch": the input raw {text} must consist of simplified hanzi
    #   (Chinese characters) in Unicode, with ideographic punctuation
    #   and markup characters [][;:,。]. It is assumed that
    #   editorial annotations delimited by ideographic parens(...)were
    #   moved to #-comments, but some tags like (女子)remain, and they
    #   are not removed. The raw {text} should not have embedded ASCII
    #   blanks or other ASCII chars. Example:
    #
    #     "冬葵子:[味]甘寒。[主治]五藏六腑寒热羸瘦。[久服]坚骨长肌肉。(草头)"
    #
    #   "ps": the raw {text} must be a string of isolated pinyin with
    #   syllables separated by blanks or ASCII punctuation. There may be
    #   tags like (nǚ zǐ) in ASCII parentheses; other editorial comments
    #   should have been removed. The {text} may contain apostrophes,
    #   asterisks, and other ASCII markup. Example:
    #
    #     "mǔ lì: [wèi] xián píng. [zhǔ zhì] shāng hán hán*(cǎo tóu) rè. [shēng] chí zé."
    #
    # CLEANUP
    #
    # Cleanup for both {utype}s implies:
    #
    #   Replacing all punctuation by spaces.
    #
    # If {utype} is "ps", the cleanup then entails removing
    # extra spaces, leaving only one space between syllables.
    #
    # CLEANED TEXT
    #
    # After this cleanup, if {utype} is "ch", the raw text must
    # consist entirely of simplified hanzi. Example:
    #
    #   "冬葵子味甘寒主治五藏六腑寒热羸瘦久服坚骨长肌肉"
    #
    # If {utype} is "ps", the text must consist entirely of pinyin
    # syllables, separated by single spaces. Example:
    #
    #   "mǔ lì wèi xián píng zhǔ zhì shāng hán hán rè shēng chí zé"
    #
    if utype == "ch":
        # # Remove annotations (enclosed in ideographic parens):
        # text = re.sub(r"[(][^()]*[)]", "", text)
        # Remove all punctuation:
        text = re.sub(r"[][;:,。()]", "", text)
    elif utype == "ps":
        # Remove asterisks:
        text = re.sub(r"[*]", "", text)
        # # Remove annotations enclosed in ASCII parens:
        # text = re.sub(r"[(][^()]*[)]", "", text)
        # Remove all punctuation, leave normalized blanks:
        text = re.sub(r"[][',.:;()]", " ", text)
        text = re.sub(r"[ ][ ]+", " ", text)
        text = text.strip()
    else:
        arg_error(f"invalid text unit type {utype = !r}")
    check_bencao_cleanup(text, utype, data_error)
    return text
    # ----------------------------------------------------------------------

def normalize_starps_raw_text(text, utype, data_error):
    # Normalizes the raw {text} from an SPS transcription file according to {utype}
    # for use in Note 077 analysis. Returns the normalized text.
    #
    # For all {utype}s, the normalization entails fixing assumed scribal
    # mistakes and assumed non-significant handwriting variation,
    # by mapping
    #
    #   'g','m'  to 'il'
    #   'u'      to 'n'
    #   'ir'     to 'iin'
    #   'hh'     to 'he'
    #   'ih'     to 'ch'
    #   'iGh'    to 'cGh' for any gallows G.
    #
    if utype == "ec" or utype == "wc" or utype == "wp":
        # Reduce presumed handwriting variants ('g' joins 'm' before 'm' -> 'il' below):
        text = re.sub(r"g", "m", text)
        text = re.sub(r"u", "n", text)
        # Correct presumed scribal and transcription errors and abbreviations:
        text = re.sub(r"ir", "iin", text)
        text = re.sub(r"m", "il", text)
        text = re.sub(r"hh", "he", text)
        text = re.sub(r"ih", "ch", text)
        text = re.sub(r"i([ktpf])h", r"c\1h", text)
    else:
        arg_error(f"invalid {utype = !r}")
    return text
    # ----------------------------------------------------------------------

def normalize_bencao_raw_text(text, utype, data_error):
    # Normalizes the raw {text} from an SBJ transcription file according to {utype}
    # for use in Note 077 analysis. Returns the normalized {text}.
    #
    # The input text {text} must have the punctuation, including brackets
    # '[]' around keywords 味, 主治, 久服, etc.
    #
    # Cleanup for both {utype}s implies:
    #
    #   Deleting entry fields that apparently are not transcribed in the SPS:
    #
    #     [味]...    [wèi]          "taste and warmth"
    #     [一名]...  [yī míng]      "another name"
    #     [生 ]...   [shēng]        "place of origin"
    #
    #   Deleting the sub-entry "鸡白蠹:肥猪。" = "jī bái dù: féi zhū."
    #   which is a vet/farming use.
    #
    #   Normalizing some keys:
    #
    #     [治]     [zhì]           to [主治] [zhǔ zhì]
    #     [主]     [zhǔ]           to [主治] [zhǔ zhì]
    #     [久服]之 [jiǔ fú] zhī    to [久服] [jiǔ fú]
    #     [久食]   [jiǔ shí]       to [久服] [jiǔ fú]
    #
    if utype == "ch":
        check_bencao_cleanup(text, utype, data_error)
        # Remove fields apparently omitted from the SPS:
        text = re.sub(r"\[(味|一名|生)\][^][]*", "", text)
        text = re.sub(r"鸡白蠹[:,]?肥猪[。]?", "", text) # Vet/farming use.
        # Regularize field keywords:
        text = re.sub(r"\[治\]", "[主治]", text) # On b1.1.002.
        text = re.sub(r"\[主\]", "[主治]", text)
        text = re.sub(r"\[久服之\]", "[久服]", text) # On .
        text = re.sub(r"\[久食\]", "[久服]", text)
    elif utype == "ps":
        # Remove entries apparently omitted from the SPS:
        text = re.sub(r"\[(wèi|yī[ ]+míng|shēng)\][^][]*", "", text)
        text = re.sub(r" *\bjī[ ]+bái[ ]+dù[,:]?[ ]*féi[ ]+zhū\b[.]? *", " ", text) # Vet/farming use.
        # Regularize field keywords.
        # BUG FIX: the original patterns had a doubled closing bracket
        # ("\[zhì]\]", "\[zhǔ]\]") which matched only the literal text
        # "[zhì]]"/"[zhǔ]]" and thus never fired; the "ch" branch above
        # shows the intended single-bracket form.
        text = re.sub(r"\[zhì\]", "[zhǔ zhì]", text) # On b1.1.002.
        text = re.sub(r"\[zhǔ\]", "[zhǔ zhì]", text) # On b1.1.002.
        text = re.sub(r"\[jiǔ[ ]+fú[ ]+zhī\]", "[jiǔ fú]", text) # On .
        text = re.sub(r"\[jiǔ[ ]+shí\]", "[jiǔ fú]", text)
    else:
        arg_error(f"invalid text unit type {utype = !r}")
    return text
    # ----------------------------------------------------------------------

def check_starps_cleanup(text, utype, data_error):
    # Checks whether {text} is a valid "clean" SPS text according
    # to the given {utype}. Calls {data_error} if not.
    # Check for invalid characters:
    if utype == "ec":
        m_bad = re.search(r"[^ac-ik-vxy?]", text)
    elif utype == "wc" or utype == "wp":
        m_bad = re.search(r"[^.ac-ik-vxy?]", text)
    else:
        arg_error(f"invalid {utype = !r}")
    if m_bad is not None:
        ibeg = m_bad.start()
        data_error(f"invalid text char '{text[ibeg]}' at pos {ibeg}")
    if utype == "wc" or utype == "wp":
        # Check for improper spacing (leading, trailing, or doubled '.'):
        pat_bad_punc = r"^[.]|[.][.]|[.]$"
        m_bad = re.search(pat_bad_punc, text)
        if m_bad is not None:
            ibeg = m_bad.start()
            data_error(f"improper punctuation '{text[ibeg]}' at pos {ibeg}")
    return
    # ----------------------------------------------------------------------

def check_bencao_cleanup(text, utype, data_error):
    # Checks whether {text} is a valid "clean" SBJ text according
    # to the given {utype}. Calls {data_error} if not.
    #
    # !!! Currently does not do a thorough check. Improve! !!!
    if utype == "ch":
        # Check for ASCII/Latin-1 characters (hanzi lie above U+00FF):
        m_bad = re.search(r"[\001-\377]", text)
    elif utype == "ps":
        # Check for characters not valid in pinyin:
        pn_cons = r"b-df-hj-np-tv-z" # Pinyin consonants.
        pn_vows = r"aeiouü" + r"āēīōūǖ" + r"àèìòùǜ" + r"áéíóúǘ" + r"ǎěǐǒǔǚ"
        pn_letr = pn_cons + pn_vows
        m_bad = re.search(f"[^ {pn_letr}]", text)
    else:
        # Original fell through here leaving {m_bad} unset (NameError);
        # report a proper argument error instead, as {check_starps_cleanup} does:
        arg_error(f"invalid {utype = !r}")
    if m_bad is not None:
        ibeg = m_bad.start()
        data_error(f"invalid text char '{text[ibeg]}' at {ibeg}")
    return
    # ----------------------------------------------------------------------

def get_parsing_patterns(utype):
    # Returns
    #
    #   {pat_line}  RE pattern matching a data line, where group 1
    #               is the locator (minus '<>') and group 2 is the text.
    #
    #   {pat_unit}  RE pattern matching one unit in the raw text.
    #
    #   {pat_sepa}  RE pattern matching any char that serves
    #               as unit separator in the clean text.
    #
    #   {clean_sepa} String that should separate units in clean text.
    #
    # The pattern {pat_sepa} may be {None}
    # if there are supposed to be no chars in that role.
    #
    # See {clean_up_starps_raw_text} and {clean_up_raw_bencao_text} for
    # the semantics of {utype}.
    #
    # The locus ID format {loc} depends on {utype}:
    #
    #   "ch" or "ps": the {loc} must be "b{sec}.{subsec}.{lseq}".
    #
    #   "ec", "wp", or "wc": the {loc} must be "f[0-9]*[rv][0-9]*".
    #
    # Unit of text size/position:
    #
    #   "ch": The unit of counting is one hanzi.
    #
    #   "ps": The unit of counting is the syllable.
    #
    #   "ec": a single EVA character [?a-z].
    #
    #   "wc" and "wp": one EVA token, a string [?a-z]+ delimited by '.'
    #
    if utype == "ch" or utype == "ps":
        pat_loc = r"<(b[1-3][.][0-9][.][0-9][0-9][0-9])>"
        if utype == "ch":
            pat_unit = r"."     # Each char is a unit.
            pat_sepa = None     # There are no unit separators on input.
            clean_sepa = ""     # No separators in cleaned text.
        elif utype == "ps":
            pinyin_cons = r"b-df-hj-np-tv-z" # Pinyin consonants.
            pinyin_vows = r"aeiouü" + r"āēīōūǖ" + r"àèìòùǜ" + r"áéíóúǘ" + r"ǎěǐǒǔǚ"
            pat_syl = f"[{pinyin_cons}]*[{pinyin_vows}]+(r|ng|n|)"
            pat_unit = f"{pat_syl}" # A unit is a pinyin syllable.
            pat_sepa = r"[ ]"   # Single blank is separator in clean text.
            clean_sepa = ' '    # Single blank is separator in cleaned text.
    elif utype == "ec" or utype == "wc" or utype == "wp":
        pat_loc = r"<(f[0-9]+[rv][0-9]*[.][0-9]+)>"
        if utype == "ec":
            pat_unit = r"[a-z?]"  # Unit is a single EVA char.
            pat_sepa = None       # There are no unit separators on input.
            clean_sepa = ''       # No unit separator in cleaned text.
        elif utype == "wc" or utype == "wp":
            pat_unit = r"[a-z?]+" # Unit is one or more EVA chars.
            pat_sepa = r"[.]"     # In clean text only '.' is separator.
            clean_sepa = '.'      # Unit separator is '.' in cleaned text.
    else:
        arg_error(f"invalid {utype = !r}")
    pat_line = f"{pat_loc}[ ]+(.*)"
    return pat_line, pat_unit, pat_sepa, clean_sepa
    # ----------------------------------------------------------------------

def hanzi_per_unit(utype):
    # Nominal average number of units of type {utype} for a Chinese ideogram word.
    if utype == "ch" or utype == "ps":
        return 1.0000
    elif utype == "wc":
        return 0.7952
    elif utype == "wp":
        return 0.8994
    elif utype == "ec":
        return 0.1568
    else:
        assert False, f"invalid {utype = !r}"
    # ----------------------------------------------------------------------

def split_text_into_units(text, utype, pat_unit, pat_sepa, data_error):
    # Splits cleaned {text} into units of the type specified by {utype}.
    # Assumes that the text was cleaned according to {utype}.
    #
    # Returns a list {units} with the units, and a count {ct_sepa} of
    # chars in {text} that were matched by {pat_sepa} (and thus not counted as units).
    #
    # Currently the {utype} must be "ch" (Chinese characters in Unicode),
    # "ps" (pinyin with separated syllables in Unicode), "ec", "wc", or "wp"
    # (EVA-encoded Voynichese).
    #
    # Loop on units and separators:
    ct_sepa = 0
    units = []
    while len(text) > 0:
        if pat_sepa is not None:
            m = re.match(pat_sepa, text)
            if m is not None:
                assert m.start() == 0
                n = m.end()
                assert n > 0, f"pattern {pat_sepa} matched empty"
                ct_sepa += n
                text = text[n:]
                continue
        m = re.match(pat_unit, text)
        if m is not None:
            assert m.start() == 0
            n = m.end()
            assert n > 0, f"pattern {pat_unit} matched empty"
            units.append(text[0:n])
            text = text[n:]
            continue
        data_error(f"invalid chars = '{text}'")
    return units, ct_sepa
    # ----------------------------------------------------------------------

def list_occurrences(word, units, clean_sepa, utype, data_error):
    # Returns a list of the occurrences of {word} in
    # the text that consists of the list {units} of units of type {utype}
    # preceded, separated, and terminated by {clean_sepa}.
    #
    # Note that the bracketing {clean_sepa} make the text not valid.
    # They are added to simplify matching whole words.
    #
    # The {word} may be an RE pattern and/or may include {clean_sepa}
    # to control matches.
    #
    # !!! Misses overlapping occurrences. Is that OK? !!!
    if len(units) == 0:
        return []
    btext = clean_sepa + clean_sepa.join(units) + clean_sepa
    if utype == "ch":
        oclist = list_ch_occurrences(word, btext)
    elif utype == "ps":
        oclist = list_ps_occurrences(word, btext)
    elif utype == "ec":
        oclist = list_ec_occurrences(word, btext)
    elif utype == "wc" or utype == "wp":
        oclist = list_wc_or_wp_occurrences(word, btext)
    else:
        arg_error(f"invalid {utype = !r}")
    return oclist
    # ----------------------------------------------------------------------

def list_ch_occurrences(word, btext):
    # Finds occurrences of {word} as string in the Unicode CHU-8 hanzi
    # {btext}, returning their positions. The text {btext} should have no
    # iso-latin-1 chars or hanzi punctuation.
    #
    # Positions are measured in hanzi chars.
    oclist = list(( p.start() for p in re.finditer(word, btext) ))
    return oclist
    # ----------------------------------------------------------------------

def list_ps_occurrences(word, btext):
    # Search {word} in {btext} matching only whole syllables. The
    # {btext} text should have single ' ' before, between, and after
    # each syllable.
    #
    # Positions are measured in pinyin syllables.
    debug = False
    assert btext[0] == ' ' and btext[-1] == ' ', "bug padding"
    # We must loop on syllables:
    rest = btext
    word = r"\b" + word + r"\b" # Ensure that {word} matches only whole sylls.
    oclist = []
    pref = ""
    while True:
        if debug: err.write(f"!* '{pref}|{rest}'\n")
        m = re.search(word, rest)
        if m is None: break
        ix = m.start(); fx = m.end()
        if debug: err.write(f"!* {ix = } {fx = }\n")
        assert ix < len(rest)
        # If {word} matched just before ' ', adjust {ix} to exclude the ' ':
        if rest[ix] == ' ': ix = ix + 1
        # If {word} matched just after ' ', adjust {fx} to exclude the ' ':
        if fx < len(rest) and rest[fx-1] == ' ': fx = fx - 1
        if debug: err.write(f"!* adjusted {ix = } {fx = }\n")
        # Make sure that match is whole words:
        assert ix > 0 and ix < len(rest) and rest[ix-1] == ' '
        assert fx > ix, f"the pattern {word = !r} matched the empty string"
        assert fx < len(rest) and rest[fx] == ' '
        # Count whole words before occurrence of {word}:
        pref = pref + rest[0:ix]; core = rest[ix:fx]; rest = rest[fx:]
        if debug: err.write(f"!* '{pref}[{core}]{rest}'\n")
        oc = len(re.findall(r"[ ]", pref)) - 1
        if debug: err.write(f"!* {oc = }'\n")
        oclist.append(oc)
        pref = pref + core
    return oclist
    # ----------------------------------------------------------------------

def list_ec_occurrences(word, btext):
    # Finds occurrences of {word} as string in the EVA {btext}, returning
    # their positions. Assumes that all blanks and EVA punctuation [-,.]
    # have been deleted.
    #
    # Positions are measured in EVA character counts.
    oclist = list(( p.start() for p in re.finditer(word, btext) ))
    return oclist
    # ----------------------------------------------------------------------

def list_wc_or_wp_occurrences(word, btext):
    # Finds occurrences of {word} in the EVA {btext}, returning their
    # positions.
    #
    # Assumes that blanks, other EVA junk, and unwanted separators ([,]
    # for "wp") have been deleted, relevant unit separators ([-,.] for
    # "wc", [-.] for "wp") have been mapped to '.', and there is a single
    # '.' before, between, and after every unit.
    #
    # Positions are measured in EVA words. Unmatched EVA word suffixes or
    # prefixes are counted as 0.5 word.
    assert btext[0] == '.' and btext[-1] == '.', "cleanup failed"
    debug = False
    # We must loop on words:
    rest = btext
    oclist = []
    pref = ""
    while True:
        if debug: err.write(f"!* '{pref}|{rest}'\n")
        m = re.search(word, rest)
        if m is None: break
        ix = m.start(); fx = m.end()
        if debug: err.write(f"!* {ix = } {fx = }\n")
        assert ix < len(rest)
        # If {word} matched just before '.', adjust {ix} to exclude the '.':
        if rest[ix] == '.': ix = ix + 1
        # If {word} matched just after '.', adjust {fx} to exclude the '.'
        # (BUG FIX: the original tested for ' ' here, copied from the
        # pinyin version; the separator in this text type is '.'):
        if fx < len(rest) and rest[fx-1] == '.': fx = fx - 1
        if debug: err.write(f"!* adjusted {ix = } {fx = }\n")
        # Count whole words before occurrence of {word}:
        pref = pref + rest[0:ix]; core = rest[ix:fx]; rest = rest[fx:]
        if debug: err.write(f"!* '{pref}[{core}]{rest}'\n")
        oc = len(re.findall(r"[.]", pref)) - 1
        if debug: err.write(f"!* {oc = }'\n")
        # If {word} matched only a suffix, count the prefix as half a word:
        if pref != '' and pref[-1] != '.': oc += 0.5
        oclist.append(oc)
        pref = pref + core
    return oclist
    # ----------------------------------------------------------------------

def find_keywords(text, kword):
    # Splits {text} at every occurrence of the pattern {kword}. Returns
    # {gaps,gsizes,hits,hsizes} where {hits} is a list of the {nh}
    # occurrences of {kword} in {text}, {gaps} is a list of the {ng=nh+1}
    # strings before, between, and after those strings, and {gsizes,hsizes}
    # are lists of the lengths of those strings.
    chops = re.split(f"({kword})", text)
    assert len(chops) >= 1
    ng = (len(chops) + 1) // 2 # Number of actual gaps.
    nh = ng - 1                # Number of actual hits.
    assert len(chops) == 2*ng - 1
    gaps = [ chops[2*i] for i in range(nh+1) ]
    hits = [ chops[2*i + 1] for i in range(nh) ]
    gsizes = [ len(g) for g in gaps ]
    hsizes = [ len(h) for h in hits ]
    return gaps, gsizes, hits, hsizes
    # ----------------------------------------------------------------------

def compute_total_size_range(gsizes, hsize):
    # Computes a total size range
    # given the estimated integer ranges {gsizes[0..ng-1]} of gap sizes
    # between certain keywords and the estimated size range
    # {hsize} for the separating keyword.
    ng = len(gsizes); nh = ng - 1
    tsize_lo = nh*hsize[0]; tsize_hi = nh*hsize[1]
    for egs in gsizes:
        tsize_lo += egs[0]
        tsize_hi += egs[1]
    return (tsize_lo, tsize_hi,)
    # ----------------------------------------------------------------------

def round_est_size(sz, eps_log):
    # Rounds an estimated size after adding
    # {eps_log} in log scale. Rounds down if {eps_log}
    # is negative, up otherwise.
    if eps_log < 0:
        rsz = int(floor(exp(log(sz) + eps_log)))
        assert rsz <= sz
    elif eps_log > 0:
        rsz = int(ceil(exp(log(sz) + eps_log)))
        assert rsz >= sz
    else:
        rsz = int(sz + 0.5)
    return rsz
    # ----------------------------------------------------------------------

def parse_size_ranges(szranges_str):
    # The list {szranges_str} must be a string consisting of {neg} items
    # separated by commas, where each item is either an integer or a pair of
    # integers separated by '..'. They are converted to a list
    # {szranges[0..neg-1]} of integer pairs {(lo,hi)}.
    #
    xitems = re.split(r"[,]", szranges_str)
    szranges = []
    for xit in xitems:
        xit = xit.strip()
        if xit != "":
            m = re.fullmatch(r"([0-9]+)[.][.]([0-9]+)", xit)
            if m is not None:
                it_min = int(m.group(1))
                it_max = int(m.group(2))
                assert 1 <= it_min and it_min <= it_max, f"bad expected range {it_min}..{it_max}"
                szranges.append((it_min,it_max,))
            else:
                it_val = int(xit)
                szranges.append((it_val,it_val,))
    return szranges
    # ----------------------------------------------------------------------

def format_range(szrange):
    # Formats an integer pair {(lo,hi)} as "lo..hi", or just "lo" if equal.
    los = szrange[0]; his = szrange[1]
    if los == his:
        return str(los)
    else:
        return str(los) + ".." + str(his)
    # ----------------------------------------------------------------------

def format_size_ranges(szranges):
    # Formats a list of integer pairs as comma-separated "lo..hi" items.
    szranges_str = ''
    for szrange in szranges:
        if szranges_str != '': szranges_str += ','
        egs_str = format_range(szrange)
        szranges_str += egs_str
    return szranges_str
    # ----------------------------------------------------------------------

def combine_gaps_and_hits(gaps, gsizes, new_gsizes, hits, hsizes, new_hsizes):
    # Concatenates {gaps} and {hits} to match the lists {new_gsizes}.
    # Puts brackets around hits.
    ng = len(gsizes)
    assert len(hsizes) == ng - 1
    nog = len(new_gsizes)
    assert len(new_hsizes) == nog - 1
    assert nog <= ng
    fgaps = [ '', ]
    fgsizes = [ 0 ]
    fhits = []
    fhsizes = []
    iog = 0
    for ig in range(ng):
        h = '' if ig == 0 else '[' + hits[ig-1] + ']'
        hs = 0 if ig == 0 else hsizes[ig-1]
        g = gaps[ig]
        gs = gsizes[ig]
        if fgsizes[iog] + hs + gs <= new_gsizes[iog]:
            # Current hit+gap still fits into the current merged gap:
            fgaps[iog] += h + g
            fgsizes[iog] += hs + gs
        elif fgsizes[iog] == new_gsizes[iog]:
            # Current merged gap is complete; this hit starts a new one:
            fhits.append(h); fhsizes.append(hs)
            fgaps.append(g); fgsizes.append(gs)
            iog += 1
            assert iog == len(fgsizes) - 1
        else:
            err.write(f"!* {gaps = !r}\n")
            err.write(f"!* {hits = !r}\n")
            err.write(f"!* {gsizes = !r}\n")
            err.write(f"!* {hsizes = !r}\n")
            err.write(f"!* {new_gsizes = !r}\n")
            err.write(f"!* {new_hsizes = !r}\n")
            err.write(f"!* {iog = !r}\n")
            err.write(f"!* {fgaps = !r}\n")
            err.write(f"!* {fhits = !r}\n")
            err.write(f"!* {fgsizes = !r}\n")
            err.write(f"!* {fhsizes = !r}\n")
            assert False
    # Paranoia:
    assert len(fgaps) == nog
    assert len(fgsizes) == nog
    assert len(fhits) == nog - 1
    assert len(fhsizes) == nog - 1
    for iog in range(nog):
        assert fgsizes[iog] == new_gsizes[iog]
        if iog > 0: assert fhsizes[iog-1] == new_hsizes[iog-1]
    return fgaps, fgsizes, fhits, fhsizes
    # ----------------------------------------------------------------------

def test_stuff():
    # Smoke test: cleans sample texts of every {utype}, splits them into
    # units, and lists occurrences of sample words, printing to stderr.

    def data_error(msg):
        err.write(f"** {msg}\n"); assert False
        # ....................................................................

    for utype in ( "wc", "wp", "ec", "ch", "ps" ):
        err.write(f"!* counting '{utype}' units\n")
        pat_line, pat_unit, pat_sepa, clean_sepa = \
            get_parsing_patterns(utype)
        if utype == "ec" or utype == "wc" or utype == "wp":
            text = "foo,par.paz-q.?ofoo,parrifoo"
            words = [ "foo", "par", ]
            if utype == "wc" or utype == "wp":
                words += [ "[.]foo", "foo[.]", "[.]foo[.]", "foo[.]par", ]
                assert clean_sepa == '.'
            elif utype == "ec":
                assert clean_sepa == ''
        elif utype == "ps":
            text = " tóng yè wèi. zhǔ zhì è zhe zhǔ zhì yīn. pí, zhǔ wǔ zhì, shāng ǔ. ǔ."
            words = [ "zhǔ", "zhǔ zhì" ]
            assert clean_sepa == ' '
        elif utype == "ch":
            text = "白石英:[味]甘微温。[主治]消渴,阴痿不足,欬逆白石,青石黑石脂等:[味]甘平。[主]黄疸,泄利"
            words = [ "白石", "主治" ]
            assert clean_sepa == ''
        else:
            assert False
        err.write(f"!* raw text = '{text}')\n")
        text = clean_up_raw_text(text, utype, data_error)
        err.write(f"!* cleaned text = '{text}')\n")
        units, ct_sepa = \
            split_text_into_units(text, utype, pat_unit, pat_sepa, data_error)
        err.write(f"!* {ct_sepa = } {units = !r}\n")
        err.write("\n")
        for word in words:
            err.write(f"!* looking up '{word}'\n")
            oclist = list_occurrences(word, units, clean_sepa, utype, data_error)
            err.write(f"!* found = {oclist}\n")
        err.write("\n")
    return
    # ----------------------------------------------------------------------

if len(sys.argv) > 1 and sys.argv[1] == "test":
    test_stuff()