#! /bin/python3
# Last edited on 2026-04-28 13:09:48 by stolfi

# Functions for aligning micro-parsed SBJ entries with SPS word text.

import sys, os, re
from sys import stderr as err
from process_funcs import bash, basic_line_loop
from error_funcs import arg_error, file_line_error, prog_error
from chinese_funcs import read_chinese_char_set
import size_position_funcs as spf
import write_parsing_funcs as wpf
from math import sqrt, hypot, exp, log, pi, inf, nan, floor, ceil

def align_text_wc_with_micro_parsing_tg_wh_en_and_macro_parsing_ec \
  ( items_tg, items_wh, items_en, segs_ch, text_wc, segs_ec ):
  # Aligns the EVA text of an SPS parag {P} with the micro-parsing of
  # the SBJ entry {E} that {P} is supposed to translate.
  #
  # Inputs:
  #
  #   {items_wh}  a micro-parsing of {E} into "items" suitable for
  #               {h77.three_column_entry_table}: hanzi possibly
  #               interspersed with ideographic punctuation and
  #               ideographic spaces.
  #
  #   {items_tg}  the tag strings of the items of {items_wh}.
  #
  #   {items_en}  the English translations of the items, aligned
  #               with {items_wh}.
  #
  #   {segs_ch}   a macro-parsing of {E} into alternating gaps and
  #               keyword hits; each string is hanzi only.
  #
  #   {text_wc}   the text of {P}, with words separated by [.,-].
  #
  #   {segs_ec}   the macro-parsing of {P} that is supposed to match
  #               {segs_ch}; each string is EVA letters [a-z?] only.
  #
  # Both macro-parsings must have the same odd length {ns = ng + nh},
  # with {nh = ns//2} hits and {ng = nh+1} gaps. The text {text_wc},
  # minus its word separators, must be the concatenation of {segs_ec}.
  #
  # The concatenation of {segs_ch} must be the hanzi of {items_wh}
  # minus the items that are assumed to be omitted from {P}; each item
  # must be either entirely preserved or entirely omitted.
  #
  # Steps:
  #
  #   1. {remove_omitted_bencao_items} drops from {items_wh},
  #      {items_tg}, {items_en} the items that have no counterpart
  #      in {segs_ch}.
  #
  #   2. {split_micro_parsings_tg_wh_en_per_macro_parsing_ch} spreads
  #      the remaining items (or pieces of them) over {ns} lists, one
  #      per segment of {segs_ch}.
  #
  #   3. {split_text_wc_per_macro_parsing_ec} cuts {text_wc} into {ns}
  #      strings with the same EVA letters as {segs_ec}.
  #
  #   4. {split_segment_wc_proportionally_to_micro_parsing_wh} cuts each
  #      of those strings at word separators into as many pieces as
  #      there are hanzi bites in the same segment, with letter counts
  #      roughly proportional to the hanzi counts.
  #
  #   5. The per-segment lists are concatenated, and the indentation of
  #      each hanzi bite is copied onto its EVA and English bites (see
  #      {copy_bite_indentation}).
  #
  # Returns four parallel lists {bites_tg, bites_wh, bites_wc, bites_en}:
  # for each {ib}, {bites_wh[ib]} is an item of {items_wh} or a piece
  # of one, {bites_tg[ib]} its tag, {bites_wc[ib]} the approximately
  # corresponding EVA word(s), and {bites_en[ib]} its English version.

  debug = False

  ns = len(segs_ch)
  nh = ns//2
  ng = nh + 1
  assert ns == ng + nh
  assert len(segs_ec) == ns

  def flatten_segs(bites_segs):
    # Concatenates the {ns} per-segment lists {bites_segs[0..ns-1]}
    # into one flat list of bites.
    assert len(bites_segs) == ns
    flat = []
    for seg_bites in bites_segs:
      flat.extend(seg_bites)
    return flat
    # ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

  def indented_like_bites_wh(bites_ref, bites_aa):
    # Returns a copy of {bites_aa} where each bite carries the
    # indentation of the parallel bite of {bites_ref}.
    assert len(bites_aa) == len(bites_ref)
    return [ copy_bite_indentation(ref, aa) for ref, aa in zip(bites_ref, bites_aa) ]
    # ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

  if debug:
    err.write(f"!$ ### input micro-parsing tg, wh, en ###\n")
    write_tg_wh_wc_en_micro_parsings(err, "!$", items_tg, items_wh, None, items_en)
    err.write(f"!$ ### input macro-parsing ch, ec ###\n")
    write_wh_ec_wc_macro_parsings(err, "!$", segs_ch, segs_ec, None)
    err.write(f"!$ ### input parag text wc ###\n")
    err.write(f"!$ {text_wc = }\n")
    err.write(f"!$ removing omitted items ...\n")

  # Step 1: drop the items that were omitted from the translation.
  items_wh, items_tg, items_en = \
    remove_omitted_bencao_items(items_wh, items_tg, items_en, segs_ch)
  nt = len(items_wh)
  assert len(items_en) == nt

  if debug:
    err.write(f"!$ after removal:\n")
    write_tg_wh_wc_en_micro_parsings(err, "!$", items_tg, items_wh, None, items_en)
    err.write(f"### input segs: ###\n")
    write_wh_ec_wc_macro_parsings(err, "!$", segs_ch, segs_ec, None)
    err.write(f"!$ Splitting items per hanzi segs ...\n")

  # Step 2: distribute the items over the hanzi gaps and hits.
  bites_wh_segs, bites_tg_segs, bites_en_segs = \
    split_micro_parsings_tg_wh_en_per_macro_parsing_ch \
      ( items_wh, items_tg, items_en, segs_ch )

  if debug:
    err.write(f"!$ Split items:\n")
    write_tg_wh_wc_en_macro_micro_parsings \
      ( err, '!$', bites_tg_segs, bites_wh_segs, None, bites_en_segs )

  for per_seg in (bites_tg_segs, bites_wh_segs, bites_en_segs):
    assert len(per_seg) == ns

  # Step 3: cut the EVA text into gaps and hits.
  if debug: err.write(f"!$ Splitting the puntuacted starps text as per macro EVA parsing ...\n")
  segs_wc = split_text_wc_per_macro_parsing_ec(text_wc, segs_ec)
  if debug:
    err.write(f"!$ split starps text:\n")
    write_wh_ec_wc_macro_parsings(err, "!$", None, segs_ec, segs_wc)
  assert len(segs_wc) == ns

  # Step 4: cut each EVA segment to match the hanzi bites of that segment.
  if debug: err.write(f"!$ Splitting the starps punctuated EVA segs as per items ...\n")
  bites_wc_segs = []
  for seg_wc, seg_bites_wh in zip(segs_wc, bites_wh_segs):
    if debug: err.write(f"!$ wc segment = {seg_wc!r} wh segment = {seg_bites_wh!r}\n")
    bts_wc = split_segment_wc_proportionally_to_micro_parsing_wh(seg_wc, seg_bites_wh)
    if debug: err.write(f"!$ {bts_wc = !r}\n\n")
    assert len(bts_wc) == len(seg_bites_wh)
    bites_wc_segs.append(bts_wc)

  if debug:
    write_tg_wh_wc_en_macro_micro_parsings \
      ( err, '!$', bites_tg_segs, bites_wh_segs, bites_wc_segs, bites_en_segs )
    err.write(f"!$ Joining lists of bites ...\n")

  # Step 5: flatten the per-segment lists and propagate the indentation.
  bites_tg = flatten_segs(bites_tg_segs)
  bites_wh = flatten_segs(bites_wh_segs)
  bites_wc = flatten_segs(bites_wc_segs)
  bites_en = flatten_segs(bites_en_segs)

  if debug:
    err.write(f"!$ joined lists:\n")
    write_tg_wh_wc_en_micro_parsings(err, '!$', bites_tg, bites_wh, bites_wc, bites_en)

  nb = len(bites_wh)
  assert len(bites_tg) == nb
  assert len(bites_wc) == nb
  assert len(bites_en) == nb

  if debug: err.write(f"!$ copying indentation ...\n")
  bites_wc = indented_like_bites_wh(bites_wh, bites_wc)
  bites_en = indented_like_bites_wh(bites_wh, bites_en)

  if debug:
    err.write(f"!$ final items lists:\n")
    write_tg_wh_wc_en_micro_parsings(err, '!$', bites_tg, bites_wh, bites_wc, bites_en)

  return bites_tg, bites_wh, bites_wc, bites_en
  # ----------------------------------------------------------------------
def copy_bite_indentation(bwh, baa):
  # Returns the string {baa} with the indentation of the string {bwh}
  # copied onto it, using one '··' for each leading indentation space
  # of {bwh}.
  #
  # Both the ideographic space (U+3000) and the ASCII space count as
  # indentation, so the result does not depend on which of the two
  # look-alike characters the input happens to use.
  #
  # An empty {baa} is turned into "-", and a "-" is never indented.
  if baa == "": baa = "-"
  if baa == "-": return baa
  nind = len(bwh) - len(bwh.lstrip(" \u3000"))
  return '··'*nind + baa
  # ----------------------------------------------------------------------

def remove_omitted_bencao_items(items_wh, items_tg, items_en, segs_ch):
  # Given parallel lists {items_wh,items_tg,items_en} of {nt} strings
  # and a macro-partition {segs_ch} of a pure hanzi string, removes from
  # the three lists the entries that are omitted in {segs_ch}.
  #
  # Each string in {items_wh} must consist of hanzi characters and hanzi
  # punctuation. The strings in the two other lists are arbitrary.
  # The strings of {segs_ch} must consist of hanzi characters only.
  #
  # The concatenation {macrotx_ch} of {segs_ch} must be what remains of
  # the concatenation of {items_wh}, with punctuation removed, after
  # deleting zero or more whole items.
  #
  # The items are scanned in order with a finger {kch} into
  # {macrotx_ch}. An item whose hanzi equal the text at the finger is
  # kept and the finger advances past it; any other item is dropped.
  # Note that an item with no hanzi at all compares equal to the empty
  # string and is therefore kept.
  #
  # Returns the new lists {items_wh, items_tg, items_en}, in that order.
  #
  # Raises {AssertionError} if the three lists differ in length, or if
  # some part of {macrotx_ch} is left unmatched at the end.

  debug = False

  nt = len(items_wh)
  assert len(items_tg) == nt
  assert len(items_en) == nt

  # Characters that are not hanzi. A deletion table is used instead of
  # a regex character class, because brackets inside a class must be
  # escaped and are easy to get wrong. Brackets and spaces are listed
  # in both their fullwidth/ideographic and ASCII forms:
  not_hanzi = str.maketrans \
    ( "", "",
      ",。;:~~" +                      # Ideographic punctuation and hyphenation marks.
      "\uFF3B\uFF3D\uFF08\uFF09" +     # Fullwidth square brackets and parentheses.
      "[]()" +                         # ASCII brackets and parentheses.
      "\u3000 "                        # Ideographic and ASCII spaces.
    )

  text_ch = "".join(segs_ch).translate(not_hanzi)
  if debug: err.write(f"!! {text_ch = !r}\n")

  items_wh_new = []
  items_tg_new = []
  items_en_new = []
  kch = 0; # Finger into {text_ch}.
  for item_wh, item_tg, item_en in zip(items_wh, items_tg, items_en):
    item_ch = item_wh.translate(not_hanzi)
    mch = len(item_ch)
    tbit_ch = text_ch[kch:kch+mch]
    if debug: err.write(f"!! {item_wh = !r} {item_ch = !r} {kch = } {tbit_ch = !r}\n")
    if item_ch == tbit_ch:
      items_wh_new.append(item_wh)
      items_tg_new.append(item_tg)
      items_en_new.append(item_en)
      kch += mch
      if debug: err.write(f"!! appended {kch = }\n")
    else:
      # Omit the item:
      if debug: err.write(f"!! omitted {kch = }\n")

  if kch != len(text_ch):
    err.write(f"!! {items_wh_new = !r}\n")
    err.write(f"!! leftover = {text_ch[kch:]}\n")
    # Raised explicitly so that the check survives {python -O}:
    raise AssertionError("segs_ch not contained in items_wh")

  return items_wh_new, items_tg_new, items_en_new
  # ----------------------------------------------------------------------
def split_micro_parsings_tg_wh_en_per_macro_parsing_ch(items_wh, items_tg, items_en, segs_ch):
  # Parameters:
  #
  #   {items_wh} a micro-parsing of a punctuated SBJ entry {E} into
  #     items suitable for a column of an aligned table;
  #
  #   {items_tg} a list of tags for the items in {items_wh};
  #
  #   {items_en} a list of English translations for the items of {items_wh};
  #
  #   {segs_ch} a macro-parsing of the pure hanzi text of {E} into
  #     alternating gaps and hits.
  #
  # Let {ns} be the length of {segs_ch}, which must be odd, with
  # {nh = ns//2} hits and {ng = nh+1} gaps. Let {nt} be the number of
  # items in {items_wh}, which must be the same for {items_tg} and
  # {items_en}.
  #
  # Each string of {items_wh} must consist of hanzi characters possibly
  # interspersed with ideographic punctuation, brackets and spaces.
  # Each string of {segs_ch} must be pure hanzi. The concatenation of
  # {segs_ch} must equal the concatenation of {items_wh} with all the
  # punctuation removed.
  #
  # The procedure distributes the items over {ns} lists
  # {bites_wh_segs[0..ns-1]}, so that the list {bites_wh_segs[ks]}
  # has the same hanzi as {segs_ch[ks]}. The actual scanning is done by
  # {gobble_up_bites_of_items}, once per segment, which also builds the
  # parallel lists {bites_tg_segs[ks]} and {bites_en_segs[ks]}.
  #
  # When an item has to be cut into two or more /bites/, the cut is
  # marked with the hyphenation mark '~' on both sides, and the leading
  # indentation of the bite before the cut is copied onto the bite
  # after it. This procedure adjusts the indentation of the hanzi bites
  # only.
  #
  # Returns the three lists of lists {bites_wh_segs}, {bites_tg_segs},
  # {bites_en_segs}, in that order.

  debug = False

  ns = len(segs_ch); nh = ns//2; ng = nh+1
  assert ns == ng + nh
  nt = len(items_wh)
  assert len(items_tg) == nt
  assert len(items_en) == nt

  # Character classes of the hanzi column. Spaces and brackets are
  # accepted in both their ideographic/fullwidth and ASCII forms; the
  # look-alike forms are written as escapes so that they cannot be
  # confused or silently converted:
  indent_wh = "\u3000 "
  punct_wh = set(",。;:~") | set(indent_wh)
  brack_wh = set("\uFF3B\uFF3D\uFF08\uFF09" + "[]()")
  hyphen_wh = '~'

  def copy_indent_at_split(bits, kb, hyphen):
    # Takes a list of lists of strings {bits}. If {bits[kb][-1]} ends
    # with {hyphen} and {bits[kb + 1][0]} begins with {hyphen}, copies
    # the leading indentation of the former onto the latter.
    if debug: err.write(f"!# {bits = !r} {kb = }\n")
    if kb < 0 or kb+1 >= len(bits): return
    bs0 = bits[kb]; bs1 = bits[kb + 1]
    if debug:
      err.write(f"!# {bs0 = !r}\n")
      err.write(f"!# {bs1 = !r}\n")
    assert isinstance(bs0, list) and isinstance(bs1, list)
    if len(bs0) == 0 or len(bs1) == 0: return
    b0 = bs0[-1]; b1 = bs1[0]
    if debug:
      err.write(f"!# {b0 = !r}\n")
      err.write(f"!# {b1 = !r}\n")
    assert isinstance(b0, str) and isinstance(b1, str)
    if b0 == "" or b1 == "": return
    split = (b0[-1] == hyphen)
    # The two sides of a cut must agree on whether there was a cut:
    assert split == (b1[0] == hyphen)
    if not split: return
    # Reuse the very same indentation characters that {b0} has:
    indent = b0[:len(b0) - len(b0.lstrip(indent_wh))]
    b1 = indent + b1
    bs1[0] = b1
    if debug:
      err.write(f"!# {b1 = !r}\n")
      err.write(f"!# new {bs1 = !r}\n")
      err.write(f"!# new {bits = !r}\n")
      err.write(f"!# ----------------------------------------\n")
    return
    # :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

  bites_wh_segs = []
  bites_tg_segs = []
  bites_en_segs = []

  # Leftover bits of prev item:
  rest_tg = ""
  rest_wh = ""
  rest_en = ""
  it = 0 # Next unused item is {items_wh[it]}.
  for ks in range(ns):
    # Collect lists {bts_wh,bts_tg,bts_en} of bits of items to match {segs_ch[ks]}:
    bts_wh, rest_wh, \
    bts_tg, rest_tg, \
    bts_en, rest_en, \
    it = gobble_up_bites_of_items \
      ( rest_wh, items_wh, rest_tg, items_tg, rest_en, items_en,
        it, punct_wh, brack_wh, segs_ch[ks], hyphen_wh
      )
    bites_wh_segs.append(bts_wh)
    bites_tg_segs.append(bts_tg)
    bites_en_segs.append(bts_en)

  # Fix indentation after splits. Must go in increasing {ks}, so that
  # an item cut more than once passes its indentation along:
  for ks in range(ns):
    copy_indent_at_split(bites_wh_segs, ks, hyphen_wh)

  # Every item must have been used up entirely:
  assert rest_wh == "" and it == nt

  return bites_wh_segs, bites_tg_segs, bites_en_segs
  # ----------------------------------------------------------------------
def split_text_wc_per_macro_parsing_ec(text_wc, segs_ec):
  # Given the text {text_wc} of an SPS parag {P} with word separators,
  # and a macro-parsing {segs_ec[0..ns-1]} of the EVA letters of {P}
  # into segments, splits {text_wc} into strings {segs_wc[0..ns-1]}
  # with the same EVA letters.
  #
  # The number {ns} of segments must be odd: {nh = ns//2} hits (odd
  # indices) alternating with {ng = nh+1} gaps (even indices). Each
  # element of {segs_ec} must be a string of pure EVA letters [a-z?].
  #
  # The string {text_wc} must consist of EVA letters possibly
  # interspersed with the word separators [.,-]. Before splitting, any
  # "<...>" tags and the characters [ <$%>] are deleted, one separator
  # is trimmed from each end, and every remaining separator becomes '.'.
  #
  # A gap takes in every separator adjacent to its letters, while a hit
  # takes only the separators that lie between its own letters. The
  # concatenation of the returned strings, minus separators and
  # hyphenation marks, equals the concatenation of {segs_ec}.
  #
  # If the cut between two consecutive segments falls inside a word,
  # a hyphenation mark '~' is appended to the first segment and
  # prepended to the second one.
  #
  # Raises {AssertionError} if {ns} is even or if the letters of
  # {text_wc} do not match those of {segs_ec}.

  debug = False

  ns = len(segs_ec)
  nh = ns//2; ng = nh + 1
  assert nh + ng == ns

  # Just in case:
  text_wc = re.sub(r"<[a-z0-9.]+>", "", text_wc)
  text_wc = re.sub(r"[ <$%>]", "", text_wc)

  # Normalize word separators:
  text_wc = re.sub(r"^[,.-]", "", text_wc)
  text_wc = re.sub(r"[,.-]$", "", text_wc)
  text_wc = re.sub(r"[,.-]", ".", text_wc)

  # Total count {nec} of EVA letters in {segs_ec}:
  nec = sum(len(sg_ec) for sg_ec in segs_ec)

  segs_wc = []
  rest_wc = text_wc # Leftover bit of {text_wc}
  kec = 0 # Index of next unmatched char in {segs_ec}.

  def grab_chars(tec, aggressive):
    # Removes from the front of {rest_wc} the characters that match the
    # EVA letters {tec}, with any separators found before the last
    # letter. If {aggressive}, also takes the separators that follow
    # the last letter. Returns the characters removed.
    nonlocal rest_wc, kec
    rest_ec = tec
    twc = ""
    while True:
      if rest_wc == "":
        assert rest_ec == "", f"unmatched gap/hit chars {rest_ec = !r}"
        break
      cwc = rest_wc[0]
      if cwc == '.':
        if rest_ec == "" and not aggressive: break
        if debug: err.write(f"!: {twc = !r} {rest_wc = !r}\n")
      else:
        if rest_ec == "": break
        cec = rest_ec[0]; rest_ec = rest_ec[1:]
        if debug: err.write(f"!: {cwc = !r} {cec = !r}\n")
        assert cwc == cec, f"wc/ec mismatch: {cwc!r} != {cec!r}"
      twc += cwc; rest_wc = rest_wc[1:]
      if debug: err.write(f"!: {twc = !r} {rest_wc = !r}\n")
    kec += len(tec)
    return twc
    # ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

  for ks in range(ns):
    # Collect chars from {rest_wc} to match {segs_ec[ks]}.
    # Gaps (even {ks}) are greedy about separators, hits are not:
    aggressive = (ks % 2 == 0)
    sg = grab_chars(segs_ec[ks], aggressive)
    # This line used to print the undefined name {rest}:
    if debug: err.write(f"!: {sg = !r} {rest_wc = !r}\n")
    segs_wc.append(sg)
  assert rest_wc == "" and kec == nec

  # Add "hyphens" '~' between split words:
  for ks in range(ns-1):
    # Check the cut between segs {ks} and {ks+1}. An empty segment
    # counts as a separator, so no mark is added next to it:
    js = ks + 1
    char1 = '.' if segs_wc[ks] == "" else segs_wc[ks][-1]
    char2 = '.' if segs_wc[js] == "" else segs_wc[js][0]
    if char1 != '.' and char2 != '.':
      segs_wc[ks] = segs_wc[ks] + '~'
      segs_wc[js] = '~' + segs_wc[js]

  return segs_wc
  # ----------------------------------------------------------------------
def gobble_up_bites_of_items \
  ( rest_aa, items_aa, rest_bb, items_bb, rest_cc, items_cc, \
    it, punct, brack, pure_aa, hyphen
  ):
  # Scans the string {rest_aa} and the strings {items_aa[it:]} building a
  # list {bites_aa} of items or bits of items until collecting all the
  # characters in the string {pure_aa}, and any punctuation characters
  # that may be interspersed with them.
  #
  # Characters in {rest_aa} and {items_aa[it:]} that do not match the
  # string {pure_aa} must be in the set {punct} (punctuation) or
  # {brack} (parenthesis-like delimiters).
  #
  # After all the characters of {pure_aa} have been collected, keeps
  # gobbling up characters that are in {brack}, and characters that
  # are in {punct} as long as the current bite is not empty. It stops
  # at the first character that is neither.
  #
  # If the first bit does not start at an item boundary, prepends
  # {hyphen} to it. If the last bit does not end at an item boundary,
  # appends {hyphen} to it.
  #
  # If {rest_bb} and {items_bb} are not {None}, also gobbles up a list
  # {bites_bb} of items and bits of items from them so as to match
  # {bites_aa}. If these parameters are {None}, sets {bites_bb} to
  # {None}. Ditto for {rest_cc} and {items_cc}, yielding {bites_cc}.
  #
  # If a list {bites_aa,bites_bb,bites_cc} would be empty,
  # sets it to a singleton list with an empty string instead.
  #
  # Returns {bites_aa, rest_aa, bites_bb, rest_bb, bites_cc, rest_cc, it}
  # where {rest_aa, rest_bb, rest_cc, it} are the updated values of
  # those input parameters.
  #
  # Fails with {AssertionError} if the items run out, or if an item
  # character that is not in {punct} or {brack} differs from the next
  # character of {pure_aa}.

  debug = False

  def get_next_character_from_items_aa():
    nonlocal rest_aa, items_aa, it
    # Gets the next character {char_aa} from the string {rest_aa} or
    # from {items_aa[it]}. Returns {char_aa} and the new values for
    # {rest_aa} and {it} (but does not update them), so that the caller
    # can decide to leave the character for a later call.
    # The names {nt}, {kp}, {np} are defined further down in the
    # enclosing procedure, before the first call.
    if rest_aa == "":
      assert it < nt, f"ran out of items_aa after {kp} of {np} chars"
      rest1_aa = items_aa[it]
      it1 = it + 1
    else:
      rest1_aa = rest_aa
      it1 = it
    assert isinstance(rest1_aa, str) and rest1_aa != ""
    char_aa = rest1_aa[0]; rest1_aa = rest1_aa[1:];
    return char_aa, rest1_aa, it1
    # ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

  if debug:
    err.write(f"!+ {rest_aa = !r} {items_aa = !r}\n")
    err.write(f"!+ {rest_bb = !r} {items_bb = !r}\n")
    # NOTE(review): this repeats the {rest_aa} line; {rest_cc,items_cc} was probably meant.
    err.write(f"!+ {rest_aa = !r} {items_aa = !r}\n")
    err.write(f"!+ {it = }\n")
    err.write(f"!+ {pure_aa = !r}\n")

  nt = len(items_aa)
  np = len(pure_aa)

  # Items of {items_aa, items_bb, items_cc} and chunks thereof that match {pure_aa}:
  bites_aa = []
  bites_bb = None if items_bb == None else []
  bites_cc = None if items_cc == None else []

  # Next incomplete strings to add to {bites_aa, bites_bb, bites_cc}.
  bt_aa = ""
  bt_bb = None if items_bb == None else ""
  bt_cc = None if items_cc == None else ""

  kp = 0 # Next character of {pure_aa} to be matched.

  # A non-empty {rest_aa} means that the previous call stopped in the
  # middle of an item, so the first bite is a continuation:
  if rest_aa != "": bt_aa = hyphen

  while rest_aa != "" or it < nt:
    if debug: err.write(f"!+ -- iteration --\n")
    if debug: err.write(f"!+ {rest_aa = !r} {it = } {rest_bb = !r} {rest_cc = !r} {it = }\n")
    if debug: err.write(f"!+ {bt_aa = !r} {bt_bb = !r} {bt_cc = !r}\n")
    if rest_aa == "" and bt_aa != "":
      # Item break, break {bites_aa, bites_bb, bites_cc} too:
      if debug: err.write(f"!+ item break ...\n")
      bites_aa.append(bt_aa); bt_aa = ""
      if items_bb != None: bites_bb.append(bt_bb); bt_bb = ""
      if items_cc != None: bites_cc.append(bt_cc); bt_cc = ""
      if debug: err.write(f"!+ {bites_aa = !r} {bites_bb = !r} {bites_cc = !r}\n")
    if debug: err.write(f"!+ {rest_aa = !r} {it = } {bt_aa = !r} {bt_bb = !r} {bt_cc = !r}\n")

    if debug: err.write(f"!+ getting next char ...\n")
    # Peek at the next character; {rest_aa} and {it} are committed only if it is gobbled:
    char_aa, rest1_aa, it1 = get_next_character_from_items_aa()
    if debug: err.write(f"!+ {char_aa = !r} {rest1_aa = !r} {it1 = }\n")

    # Take anything while {pure_aa} is not exhausted. After that, take
    # brackets, and punctuation that trails a non-empty bite. Leading
    # punctuation (such as the indentation of the next item) is left
    # for the next call:
    gobble = kp < np or char_aa in brack or (char_aa in punct and bt_aa != "")
    if debug: err.write(f"!+ {gobble = :1b} {kp = } pure_aa[kp] = { pure_aa[kp] if kp < np else '' !r}\n")
    if gobble:
      # Gobble {char_aa}:
      bt_aa = bt_aa + char_aa; rest_aa = rest1_aa;
      if char_aa not in punct and char_aa not in brack:
        pchar = pure_aa[kp]
        assert kp < np and char_aa == pchar, f"unexpected char_aa {char_aa} != {pchar} at {kp}"
        kp += 1
      # While the parallel bite is still empty, move the pending
      # {rest_bb} into it; when {char_aa} opened a new item, the
      # whole {items_bb[it]} becomes the pending string.
      # NOTE(review): this means the parallel string of a new item
      # reaches {bt_bb} only when the second character of that item is
      # gobbled -- confirm that this is fine for one-character items.
      if items_bb != None and bt_bb == "":
        bt_bb = rest_bb; rest_bb = "" if it1 == it else items_bb[it]
      if items_cc != None and bt_cc == "":
        bt_cc = rest_cc; rest_cc = "" if it1 == it else items_cc[it]
      it = it1
      if debug: err.write(f"!+ {rest_aa = !r} {rest_bb = !r} {rest_cc = !r} {it = }\n")
      if debug: err.write(f"!+ {bt_aa = !r} {bt_bb = !r} {bt_cc = !r}\n")
    else:
      # Stop here, leave {char_aa} for the next hit.
      # If the stop is inside an item, mark the cut:
      if rest_aa != "": bt_aa = bt_aa + hyphen;
      break
    if debug: err.write(f"!+ {kp = } {bt_aa = !r} {bt_bb = !r} {bt_cc = !r}\n")
    if debug: err.write(f"!+ {bt_aa = !r} {rest_aa = !r} {it = }\n")

  if debug: err.write(f"!+ -- end loop --\n")
  # Flush the last bite. The test also ensures that the lists are never
  # empty. Note that it is always true when {items_bb} or {items_cc} is
  # {None}, because then {bt_bb} or {bt_cc} is {None}, not ''.
  if bt_aa != '' or bt_bb != '' or bt_cc != '' or len(bites_aa) == 0:
    bites_aa.append(bt_aa)
    if items_bb != None: bites_bb.append(bt_bb)
    if items_cc != None: bites_cc.append(bt_cc)

  return bites_aa, rest_aa, bites_bb, rest_bb, bites_cc, rest_cc, it
  # ----------------------------------------------------------------------
def split_segment_wc_proportionally_to_micro_parsing_wh(text_wc, bites_wh):
  # Takes a fragment {text_wc} of EVA text with words separated by EVA
  # word spaces [.,-], and a list {bites_wh[0..nb-1]} of strings with
  # hanzi characters and hanzi punctuation. Splits {text_wc} into a list
  # {bites_wc[0..nb-1]} of substrings at word boundaries, so that, for
  # each {ib}, the total of EVA letters in {bites_wc[0..ib]}, not
  # counting punctuation, is approximately proportional to the number
  # of hanzi in {bites_wh[0..ib]}, not counting hanzi punctuation.
  #
  # Hyphenation marks '~' in {text_wc} are kept in the output but are
  # not counted as letters. Words are re-joined with '.', whatever the
  # original separator was. A bite that gets no words is returned as "-".
  #
  # Any words left over at the end are added to the last bite. If
  # {bites_wh} is empty and {text_wc} is not, the result is the
  # one-element list {[text_wc]}.

  debug = False
  # debug = (text_wc == "apo")

  nb = len(bites_wh)

  # Characters of the hanzi column that are not hanzi. A deletion
  # table is used instead of a regex character class, because brackets
  # inside a class must be escaped. Brackets and spaces are listed in
  # both their fullwidth/ideographic and ASCII forms:
  not_hanzi = str.maketrans \
    ( "", "",
      ",。;:~~" +                      # Ideographic punctuation and hyphenation marks.
      "\uFF3B\uFF3D\uFF08\uFF09" +     # Fullwidth square brackets and parentheses.
      "[]()" +                         # ASCII brackets and parentheses.
      "\u3000 "                        # Ideographic and ASCII spaces.
    )

  def count_hanzi(wh):
    # Number of hanzi in the string {wh}.
    return len(wh.translate(not_hanzi))

  def count_eva(wc):
    # Number of EVA letters in {wc}, excluding separators and '~'.
    # The '-' is last in the class so that it is a literal, not a range.
    return len(re.sub(r"[ .,~-]", "", wc))

  nec = count_eva(text_wc)
  nch = count_hanzi("".join(bites_wh))
  # The small terms avoid division by zero when there are no hanzi:
  ec_per_ch = (nec + 0.000001)/(nch + 0.000001)

  def get_next_ec_token(rest_wc):
    # Splits off the next non-empty EVA token from {rest_wc}.
    # The token will have only EVA letters [a-z?] or '~'.
    # Returns the token and the remaining {rest_wc}.
    # The token is "" if there is none.
    tok_ec = ""
    while rest_wc != "":
      m = re.fullmatch(r"([a-z?~]*)[.,-]+(.*)", rest_wc)
      if m == None:
        # No usable separator: all that is left is one token.
        tok_ec = re.sub(r"[.,-]", "", rest_wc); rest_wc = ""
      else:
        tok_ec = m.group(1); rest_wc = m.group(2)
      if tok_ec != "": break
    return tok_ec, rest_wc
    # ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

  rest_wc = text_wc

  def gobble_wc_tokens(tch, tec):
    # Takes EVA words from {rest_wc} while that brings the running
    # count of EVA letters, which is {tec} on entry, closer to the
    # equivalent of {tch} hanzi. Returns the bite {bwc} taken and its
    # count {mec} of EVA letters. Updates {rest_wc}.
    nonlocal rest_wc
    bwc = "" # next bite for {bites_wc}.
    mec = 0 # Count of EVA letters in {bwc}.
    tec_est = tch * ec_per_ch
    while True:
      if debug:
        err.write(f"!¤ --- inner loop ---\n")
        err.write(f"!¤ {rest_wc = !r}\n")
        err.write(f"!¤ {bwc = !r} {mec = }\n")
      if debug: err.write(f"!¤ getting next token from wc text:\n")
      tok_ec, rest_wc = get_next_ec_token(rest_wc)
      if debug: err.write(f"!¤ {tok_ec = !r} {rest_wc = !r}\n")
      if tok_ec == "":
        assert rest_wc == ""; break
      else:
        # Shall we take {tok_ec} or end {bwc} here?
        # Hyphenation marks at either end are not letters. (This used
        # to strip only the trailing one, so a leading '~' was counted.)
        rec = len(tok_ec.replace('~', ''))
        tec0 = tec + mec # Next value of {tec} if we don't take this token.
        tec1 = tec + mec + rec # Next value of {tec} if we take this token.
        take = abs(tec1 - tec_est) <= abs(tec0 - tec_est)
        if debug: err.write(f"!¤ {tec = } {mec = } {rec = }\n")
        if debug: err.write(f"!¤ {tec_est = } {tec0 = } {tec1 = } {take = :b}\n")
        if take:
          # Add {tok_ec} to current bite:
          if bwc != "": bwc += '.'
          bwc += tok_ec; mec += rec
        else:
          # Put {tok_ec} back and exit inner loop.
          if rest_wc != "": rest_wc = '.' + rest_wc
          rest_wc = tok_ec + rest_wc; break
    return bwc, mec
    # ....................................................................

  bites_wc = []
  totct_ch = 0 # Count of hanzi so far in {bites_wh}.
  totct_ec = 0 # Count of EVA chars so far in {bites_wc}.
  for ib in range(nb):
    # Count hanzi in {bites_wh[ib]}:
    bwh = bites_wh[ib]
    mch = count_hanzi(bwh)
    if debug:
      err.write(f"!¤ @@@ outer loop @@@\n")
      err.write(f"!¤ {totct_ch = } {bites_wc = !r}\n")
      err.write(f"!¤ {rest_wc = !r}\n")
      err.write(f"!¤ {bwh = !r} {mch = }\n")
    totct_ch = totct_ch + mch
    bwc, mec = gobble_wc_tokens(totct_ch, totct_ec)
    bites_wc.append(bwc); totct_ec += mec

  if rest_wc != "":
    # Force gobbling of any final leftover:
    if len(bites_wc) == 0:
      bites_wc.append(rest_wc)
    else:
      # Keep the word boundary between the last bite and the leftover:
      last_wc = bites_wc[-1]
      if last_wc != "" and rest_wc[0] not in ".,-": last_wc += '.'
      bites_wc[-1] = last_wc + rest_wc
    # The class used to be "[.,-~]", a range that deleted the letters too:
    totct_ec += count_eva(rest_wc)

  # Replace empty bites by "-":
  for ib in range(len(bites_wc)):
    if bites_wc[ib] == "": bites_wc[ib] = "-"

  return bites_wc
  # ----------------------------------------------------------------------
def write_wh_ec_wc_macro_parsings(wr, lab, segs_wh, segs_ec, segs_wc):
  # Writes to {wr} the three macro-parsings {segs_wh,segs_ec,segs_wc} as
  # the three columns of a table, between two rules of '~'. Every
  # line is prefixed with {lab}.
  #
  # Assumes that the elements of {segs_wh} are hanzi strings possibly
  # with ideographic punctuation, while those of {segs_ec} and
  # {segs_wc} are Latin or pinyin strings in Unicode.
  #
  # Callers in this module pass {None} for a column that is not available.
  #
  # The table is formatted by {write_parsings} of the module
  # {write_parsing_funcs}, imported as {wpf}. The call used to go
  # through the undefined name {wfn}.
  wr.write(f"{lab} ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
  mpars = ( \
      ( segs_wh, "?", ' ', ),
      ( segs_ec, "??", ' ', ),
      ( segs_wc, "??", ' ', ),
    )
  wpf.write_parsings(wr, lab, True, mpars)
  wr.write(f"{lab} ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
  return
  # ----------------------------------------------------------------------

def write_tg_wh_wc_en_micro_parsings(wr, lab, bites_tg, bites_wh, bites_wc, bites_en):
  # Writes to {wr} the four micro-parsings
  # {bites_tg,bites_wh,bites_wc,bites_en} as the four columns of a
  # table, between two rules of '~'. Every line is prefixed with {lab}.
  #
  # Assumes that the elements of {bites_wh} are hanzi strings possibly
  # with ideographic punctuation, while the others are Latin or pinyin
  # strings in Unicode.
  #
  # Callers in this module pass {None} for a column that is not available.
  #
  # The table is formatted by {write_parsings} of the module
  # {write_parsing_funcs}, imported as {wpf}. The call used to go
  # through the undefined name {wfn}.
  wr.write(f"{lab} ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
  mpars = ( \
      ( bites_tg, "??", ' ' ),
      ( bites_wh, "?", ' ' ),
      ( bites_wc, "??", ' ' ),
      ( bites_en, "??", ' ' ),
    )
  wpf.write_parsings(wr, lab, False, mpars)
  wr.write(f"{lab} ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
  return
  # ----------------------------------------------------------------------
def write_tg_wh_wc_en_macro_micro_parsings \
  ( wr, lab, segs_bites_tg, segs_bites_wh, segs_bites_wc, segs_bites_en ):
  # Writes to {wr} four two-level parsings (lists of lists of bites,
  # one inner list per macro segment) as the four columns of a table,
  # between two rules of '+'. Every line is prefixed with {lab}.
  #
  # Callers in this module pass {None} for a column that is not available.
  #
  # The table is formatted by {write_bilevel_parsings} of the module
  # {write_parsing_funcs}, imported as {wpf}. The call used to go
  # through the undefined name {wfn}.
  wr.write(f"{lab} ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n")
  mpars = ( \
      ( segs_bites_tg, "", ' ', ),
      ( segs_bites_wh, "?", ' ', ),
      ( segs_bites_wc, "??", ' ', ),
      ( segs_bites_en, "??", ' ', ),
    )
  wpf.write_bilevel_parsings(wr, lab, mpars)
  wr.write(f"{lab} ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n")
  return
  # ----------------------------------------------------------------------

def test_stuff():
  # Runs the self-tests of this module, reporting to {err}.
  # Only the test of the proportional splitter is enabled; the others
  # are kept commented out so that they can be turned on by hand.
  err.write("TESTING\n")
  err.write("----------------------------------------\n")
  test_split_segment_wc_proportionally_to_micro_parsing_wh()
  # err.write("----------------------------------------\n")
  # test_align_text_wc_with_micro_parsing_tg_wh_en_and_macro_parsing_ec()
  # err.write("----------------------------------------\n")
  # test_split_text_wc_per_macro_parsing_ec()
  # err.write("----------------------------------------\n")
  # test_gobble_up_bites_of_items()
  # err.write("----------------------------------------\n")
  # test_split_micro_parsings_tg_wh_en_per_macro_parsing_ch()
  err.write("----------------------------------------\n")
  return
  # ----------------------------------------------------------------------

def test_split_segment_wc_proportionally_to_micro_parsing_wh():
  # Checks {split_segment_wc_proportionally_to_micro_parsing_wh} on two
  # fixed examples with known answers.

  # Three bites with 2, 2, and 1 hanzi; the last one ends at a cut:
  text1_wc = 'rair.apy.okeey.qokaiin.or.aiir.al.dal.sheeo'
  bites1_wh = [ '  补虚,     ', '  杀毒,     ', '   辟~     ' ]
  exp_bites1_wc = [ 'rair.apy.okeey', 'qokaiin.or.aiir.al', 'dal.sheeo', ]
  do_test_split_segment_wc_proportionally_to_micro_parsing_wh(text1_wc, bites1_wh, exp_bites1_wc)

  # Five bites; both the EVA text and the last bite end at a cut:
  text2_wc = 'chedy.qokeed.okain.chdy.laiin.ofar.chedy.tedam.okeedy.lkal.daiin.yk~'
  bites2_wh = [ '风寒湿痹, ', '  乳难, ', '  消水,', '  养五脏,  ', '  益~' ]
  exp_bites2_wc = [ 'chedy.qokeed.okain.chdy', 'laiin.ofar', 'chedy.tedam', 'okeedy.lkal', 'daiin.yk~' ]
  do_test_split_segment_wc_proportionally_to_micro_parsing_wh(text2_wc, bites2_wh, exp_bites2_wc)

  return
  # ----------------------------------------------------------------------
def do_test_split_segment_wc_proportionally_to_micro_parsing_wh(text_wc, bites_wh, exp_bites_wc):
  # Runs {split_segment_wc_proportionally_to_micro_parsing_wh} on
  # {text_wc} and {bites_wh}, logging the inputs and the result to {err}.
  #
  # Checks that (1) re-joining the resulting bites with '.' gives back
  # {text_wc}, apart from repeated, leading, or trailing separators; and
  # (2) the result is exactly {exp_bites_wc}. Fails with
  # {AssertionError} otherwise.

  def squeeze_separators(tx):
    # Collapses each run of two or more separators [,.-] into one '.',
    # then trims separators from both ends of {tx}.
    tx = re.sub(r"[,.-][,.-]+", ".", tx)
    tx = re.sub(r"^[,.-]+", "", tx)
    tx = re.sub(r"[,.-]+$", "", tx)
    return tx

  err.write("=== testing split_segment_wc_proportionally_to_micro_parsing_wh ===\n")

  err.write(f"### input items ###\n")
  err.write(f"!% {text_wc = !r}\n")
  err.write(f"!% {bites_wh = !r}\n")

  err.write(f"!% splitting words proportionally to hanzi bites...\n")
  bites_wc = split_segment_wc_proportionally_to_micro_parsing_wh(text_wc, bites_wh)

  err.write(f"!% split words:\n")
  err.write(f"!% {bites_wc = !r}\n")

  # Compare the input text and the re-joined bites in normalized form:
  text_wc = squeeze_separators(text_wc)
  tchk_wc = squeeze_separators(".".join(bites_wc) + '.')
  err.write(f"!% text_wc = {text_wc!r}\n")
  err.write(f"!% tchk_wc = {tchk_wc!r}\n")
  assert text_wc == tchk_wc

  assert bites_wc == exp_bites_wc
  return
  # ----------------------------------------------------------------------
tchk_wc = re.sub(r"[,.-][,.-]+", ".", tchk_wc) tchk_wc = re.sub(r"^[,.-]+", "", tchk_wc) tchk_wc = re.sub(r"[,.-]+$", "", tchk_wc) err.write(f"!% text_wc = {text_wc!r}\n") err.write(f"!% tchk_wc = {tchk_wc!r}\n") assert text_wc == tchk_wc assert bites_wc == exp_bites_wc return # ---------------------------------------------------------------------- def test_align_text_wc_with_micro_parsing_tg_wh_en_and_macro_parsing_ec(): # err.write("=== testing align_text_wc_with_micro_parsing_tg_wh_en_and_macro_parsing_ec ===\n") items_wh_en = ( \ ( '(A) ', '丹雄鸡    ', 'Red male chicken', ), ( '(A1) ', ' [味]   ', '[Flavor]', ), ( '(A11) ', '  甘,   ', 'sweet,', ), ( '(A12) ', '  微温。  ', 'slightly warm.', ), ( '(A3) ', ' [主治]  ', '[Main uses]', ), ( '(A31) ', '  (女子) ', '(Women)', ), ( '(A311) ', '   赤白沃;', 'red and white discharge;', ), ( '(A32) ', '  补虚,  ', 'bamboo hair,', ), ( '(A33) ', '  杀毒,  ', 'kills bears,', ), ( '(A34) ', '  (女子) ', '(women)', ), ( '(A341) ', '   辟不祥。', 'cannot knit socks.', ), ( '(B) ', '头:     ', 'Head:', ), ( '(B3) ', ' [主]   ', '[Mainly for]', ), ( '(B31) ', '  杀鬼。  ', 'killing time.', ), ( '(C) ', '肪:     ', 'Transistors:', ), ) items_tg = [ tri[0] for tri in items_wh_en ] items_wh = [ tri[1] for tri in items_wh_en ] items_en = [ tri[2] for tri in items_wh_en ] nt = len(items_wh) err.write(f"### input items ###\n") write_tg_wh_wc_en_micro_parsings(err, '!!', items_tg, items_wh, None, items_en) segs_ch = [ '丹雄鸡', '主治', '赤白', '沃', '补虚杀毒辟不祥头', '主', '杀鬼肪', ] text_wc = 'poar.keeo.daiin.qotedair.apo.rair.apy.okeey.qokaiin.or.aiir.al.dal.sheeo.daiin.chsd.qokeeey' segs_ec = [ 'poar.keeo', 'daiin', 'qotedair', 'apo', 'rair.apy.okeey.qokaiin.or.aiir.al.dal.sheeo', 'daiin', 'chsd.qokeeey' ] segs_ec = [ re.sub(r"[,.~-]", "", sg) for sg in segs_ec ] err.write(f"### input segs ###\n") write_wh_ec_wc_macro_parsings(err, '!!', segs_ch, segs_ec, None) err.write(f"!! 
splitting and aligning...\n") bites_tg, bites_wh, bites_wc, bites_en = \ align_text_wc_with_micro_parsing_tg_wh_en_and_macro_parsing_ec \ ( items_tg, items_wh, items_en, segs_ch, text_wc, segs_ec) write_tg_wh_wc_en_micro_parsings(err, '!!', bites_tg, bites_wh, bites_wc, bites_en) nb = len(bites_tg) assert nb == len(bites_wh) assert nb == len(bites_wc) assert nb == len(bites_en) # html_tb = h77.make_three_column_entry_table(st, tags1, bites_wh, bites_wc, bites_en) return def test_split_micro_parsings_tg_wh_en_per_macro_parsing_ch(): err.write("=== testing split_micro_parsings_tg_wh_en_per_macro_parsing_ch ===\n") items_tg_wh_en = ( \ ( '(A)', '丹雄鸡    ', 'Red rooster:', ), ( '(A1)', ' [味]   ', '[Flavor]', ), ( '(A11)', '  甘,   ', 'sweet,', ), ( '(A12)', '  微温。  ', 'a bit fuzzy.', ), ( '(A3)', ' [主治]  ', '[Main uses]', ), ( '(A31)', '  (女子) ', '(Women)', ), ( '(A311)', '   赤白沃。', 'lipstick.', ), ( '(A32)', '  补虚,  ', 'sawing wood,', ), ( '(A33)', '  杀毒,  ', 'fixing cars,', ), ( '(A34)', '  辟不祥。 ', 'washing floors.', ), ( '(B)', '头:     ', 'Head:', ), ( '(B3)', ' [主]   ', '[Mainly for]', ), ( '(B31)', '  杀鬼。  ', 'killing time.', ), ( '(C)', '肪:     ', 'Pipes:', ), ) items_tg = [ tri[0] for tri in items_tg_wh_en ] items_wh = [ tri[1] for tri in items_tg_wh_en ] items_en = [ tri[2] for tri in items_tg_wh_en ] err.write(f"### input micro-parsings ###\n") write_tg_wh_wc_en_micro_parsings(err, '!=', items_tg, items_wh, None, items_en) segs_ch = [ '丹雄鸡味甘微温', '主治', '女子赤白', '沃', '补虚杀毒辟', '不祥头主', '杀鬼肪', ] err.write(f"### input macro-parsing of hanzi ###\n") write_wh_ec_wc_macro_parsings(err, '!=', segs_ch, None, None) # Check the data: chicktx_ch = "".join(segs_ch) err.write(f"{chicktx_ch = !r}\n") chocktx_ch = '丹雄鸡味甘微温主治女子赤白沃补虚杀毒辟不祥头主杀鬼肪' assert chocktx_ch == chicktx_ch ns = len(segs_ch) nh = ns//2; ng = nh+1; assert ns == ng + nh segs_bites_wh, segs_bites_tg, segs_bites_en = \ split_micro_parsings_tg_wh_en_per_macro_parsing_ch \ (items_wh, items_tg, items_en, segs_ch) 
write_tg_wh_wc_en_macro_micro_parsings \ ( err, '!=', segs_bites_tg, segs_bites_wh, None, segs_bites_en ) # Check for preservation of hanzi: for ks in range(ns): sg_ch = segs_ch[ks] ck_ch = re.sub(r"[,。;:[]() ~]", "", "".join(segs_bites_wh[ks])) err.write(f"!= {sg_ch = !r} {ck_ch = !r}\n") assert sg_ch == ck_ch return # ---------------------------------------------------------------------- def test_split_text_wc_per_macro_parsing_ec(): err.write("=== testing split_text_wc_per_macro_parsing_ec ===\n") text_wc = 'poar.keeo.daiin.qotedair.apo.rair.apy.okeey.qokaiin.or.aiir.al.dal.sheeo.daiin.chsd.qokeeey' segs_ec = [ 'poar.keeo', 'daiin', 'qotedair', 'apo', 'rair.apy.okeey.qokaiin.or.aiir.al.dal.sheeo', 'daiin', 'chsd.qokeeey' ] segs_ec = [ re.sub(r"[,.~-]", "", sg) for sg in segs_ec ] ns = len(segs_ec) nh = ns//2; ng = nh+1; assert ns == ng + nh segs_wc = split_text_wc_per_macro_parsing_ec(text_wc, segs_ec) write_wh_ec_wc_macro_parsings(err, "!$", None, None, segs_wc) assert len(segs_wc) == ns chicktx_wc = re.sub(r"[.,-]", "", text_wc) err.write(f"{chicktx_wc = }\n") chocktx_wc = re.sub(r"[.~]", "", "".join(segs_wc)) err.write(f"{chocktx_wc = }\n") assert chicktx_wc == chocktx_wc return # ---------------------------------------------------------------------- def test_gobble_up_bites_of_items(): err.write("=== testing gobble_up_bites_of_items ===\n") items_tg_wh_en = ( \ ( '(A)', '丹雄鸡:   ', 'Red rooster:', ), ( '(A1)', ' [味]   ', '[Flavor]', ), ( '(A11)', '  甘,   ', 'sweet,', ), ( '(A12)', '  微温。  ', 'a bit fuzzy.', ), ( '(A3)', ' [主治]  ', '[Main uses]', ), ( '(A31)', '  (女子) ', '(Women)', ), ( '(A311)', '   赤白沃。', 'lipstick.', ), ( '(A32)', '  补虚,  ', 'sawing wood,', ), ( '(A33)', '  杀毒,  ', 'fixing cars,', ), ( '(A34)', '  辟不祥。 ', 'washing floors.', ), ( '(B)', '头:     ', 'Head:', ), ( '(B3)', ' [主]   ', '[Mainly for]', ), ( '(B31)', '  杀鬼。  ', 'killing time.', ), ( '(C)', '肪:     ', 'Pipes:', ), ) items_tg = [ tri[0] for tri in items_tg_wh_en ] items_wh = [ tri[1] 
for tri in items_tg_wh_en ] items_en = [ tri[2] for tri in items_tg_wh_en ] err.write(f"### input items ###\n") write_tg_wh_wc_en_micro_parsings(err, '!=', items_tg, items_wh, None, items_en) nt = len(items_wh) punct_wh = set(r",。;: ~") brack_wh = set(r"[]()") hyphen_wh = '~' err.write(f"============================================================\n") pure_wh_1 = "雄丹鸡味甘微" rest_wh = '雄丹鸡' rest_tg = '(AH)' rest_en = 'Crimson rooster' it = 1 bites_wh, rest_wh, \ bites_tg, rest_tg, \ bites_en, rest_en, \ it = gobble_up_bites_of_items \ ( rest_wh, items_wh, rest_tg, items_tg, rest_en, items_en, it, punct_wh, brack_wh, pure_wh_1, hyphen_wh ) err.write(f"{bites_wh = !r} {rest_wh = !r} {it = !r}\n") err.write(f"{bites_tg = !r} {rest_tg = !r} {it = !r}\n") err.write(f"{bites_en = !r} {rest_en = !r} {it = !r}\n") assert bites_wh == [ '~雄丹鸡', ' [味]   ', '  甘,   ', '  微~', ] assert bites_tg == [ '(AH)', '(A1)', '(A11)', '(A12)', ] assert bites_en == [ 'Crimson rooster', '[Flavor]', 'sweet,', 'a bit fuzzy.', ] assert rest_wh == "温。  " and it == 4 err.write(f"============================================================\n") pure_wh_2 = "主治" rest_wh = '' rest_tg = '' rest_en = '' it = 4 bites_wh, rest_wh, \ bites_tg, rest_tg, \ bites_en, rest_en, \ it = gobble_up_bites_of_items \ ( rest_wh, items_wh, rest_tg, items_tg, rest_en, items_en, it, punct_wh, brack_wh, pure_wh_2, hyphen_wh ) err.write(f"{bites_wh = !r} {rest_wh = !r} {it = !r}\n") err.write(f"{bites_tg = !r} {rest_tg = !r} {it = !r}\n") err.write(f"{bites_en = !r} {rest_en = !r} {it = !r}\n") assert bites_wh == [ ' [主治]  ', ] assert bites_tg == [ '(A3)', ] assert bites_en == [ '[Main uses]', ] assert (rest_wh == "(女子) " and it == 6) or (rest_wh == "" and it == 5) err.write(f"============================================================\n") pure_wh_3 = "杀鬼肪" rest_wh = '   ' rest_tg = '' rest_en = '' it = 12 bites_wh, rest_wh, \ bites_tg, rest_tg, \ bites_en, rest_en, \ it = gobble_up_bites_of_items \ ( rest_wh, items_wh, 
rest_tg, items_tg, rest_en, items_en, it, punct_wh, brack_wh, pure_wh_3, hyphen_wh ) err.write(f"{bites_wh = !r} {rest_wh = !r} {it = !r}\n") assert rest_wh == "" and it == 14 err.write(f"{bites_tg = !r} {rest_tg = !r} {it = !r}\n") err.write(f"{bites_en = !r} {rest_en = !r} {it = !r}\n") return # ---------------------------------------------------------------------- if len(sys.argv) == 2 and sys.argv[1] == "ALBSI.TEST": test_stuff()