# Last edited on 2026-02-20 14:29:52 by stolfi
# Outputs a list of all tuples {tsize} consecutive words,
# ignoring those that contain words that contain '*'.
#
# For each tuple of {tsize} consecutive words in the same line of the input,
# writes {tsize+1} lines in the output with the format
#
# "«{LEFT}» «{MIDDLE}» «{RIGHT}» <{SEC}.{NLIN}> {KW} {SL} {SM} {SR}"
#
# where {LEFT}, {MIDDLE}, and {RIGHT} are the words of the tuple; {SL}, {SM}, {SR}
# are the counts of words in each of these strings; {SEC} and {NLIN} specify the input line
# where the tuple occurs; and {KW} is the index of the tuple's first
# word in the input line.
#
# The strings {LEFT}, {MIDDLE}, and {RIGHT} consist of whole input
# words, separated by '.'. The string {MIDDLE} has at least one word, but
# {LEFT} and {RIGHT} may be empty.

# Select the text encoding {enc} from the book/unit combination.
# NOTE(review): source was whitespace-mangled; structure below reconstructed
# from the surviving tokens and comments — confirm against the original file.
if book == "bencao":
    assert sub == "fu"
    enc = "chu" if unit == "ch" else "pys" if unit == "ps" else None
elif book == "starps":
    assert sub == "fu" or sub == "gd"
    enc = "eva"
else:
    assert False, f"bad {book = }"
assert enc is not None, f"bad combo {book = } {unit = }"

m = re.match(pat_punc, text)

if enc == "utf":
    # Cleanup consists in deleting the Chinese punctuation:
    for ch in text:
        if debug: err.write(f"!! ch = '{ch}'")
        # BUG FIX: arguments were swapped — re.fullmatch takes (pattern, string).
        if re.fullmatch(pat_punc, ch):
            num_ignored += 1
            if debug: err.write(" KO")
        else:
            good_chars.append(ch)
            if debug: err.write(" OK")
        if debug: err.write("\n")
    text = "".join(good_chars)
elif enc == "eva":
    # Cleanup consists of deleting parag markers and ensuring simple EVA.
    if unit == "ec":
        # Remove all EVA punctuation:
        tlen = len(text)
        text = re.sub(r"[-,.]", "", text)
        num_ignored += tlen - len(text)
    elif unit == "wc":
        # Normalize all punc to single '.':
        tlen = len(text)
        text = re.sub(r"[-,]", ".", text)
        # Normalize punctuation:
        text = re.sub(r"[.][.]+", ".", text)
        text = re.sub(r"^[.]+", "", text)
        text = re.sub(r"[.]+$", "", text)
        num_ignored += tlen - len(text)
    else:
        assert False, f"invalid combo {enc = } {unit = }"

# Read tables of chinese character sets:
charset = None  # Sets of special hanzi characters (punct, blank, etc.)
if unit == "ch":
    charset = dict()
    set_dir = "langbank/chin"
    charset['invalid'] = read_chinese_char_set(f"{set_dir}/utf8-invalid.tbl")
    charset['bullets'] = read_chinese_char_set(f"{set_dir}/utf8-bullets.tbl")
    charset['symbol'] = read_chinese_char_set(f"{set_dir}/utf8-symbol.tbl")
    charset['punct'] = read_chinese_char_set(f"{set_dir}/utf8-punct.tbl")
    charset['blank'] = read_chinese_char_set(f"{set_dir}/utf8-blank.tbl")

# Patterns for parsing the input lines:
pat_line = None  # Matches a pinyin line, with groups {LOC} and {TEXT}.
pat_punc = None  # Matches pinyin punctuation (excluding blanks).
pat_word = None  # Matches a pinyin word (syllable or compound).
if unit == "ch":
    # NOTE(review): this branch body was lost in the source mangling —
    # the hanzi-specific patterns must be restored from the original file.
    pass
elif unit == "ps" or unit == "pj":
    # Patterns for parsing pinyin:
    pat_loc = r"b[1-3][.][1-6][.][0-9][0-9][0-9]"
    pat_line = f"<({pat_loc})>[ ]+(.*)\n"
    pat_punc = r"[.,;()*]"
else:
    assert False

pat_sec = r"s[0-2]"               # Section s-number, "s0" to "s2".
pat_sub = r"[.][0-9]"             # Subsection number, 0 to 9, with '.'.
pat_lseq = r"[.][0-9][0-9][0-9]"  # , with '.'.
pat_locid = f"<({pat_sec})({pat_sub})({pat_lseq})>"

# NOTE(review): the 'else:' below was orphaned in the mangled source; the
# enclosing 'if m is not None:' is reconstructed from the adjacent comments
# ("Is a data line" vs "Non-parag data line") — confirm against the original.
if m is not None:
    # Is a data line:
    if m.lastindex != 4: prog_error("num fields = %d" % m.lastindex)
    sec = m.group(1)
    sub = m.group(2)
    lseq = m.group(3)
    text = m.group(4).strip()  # {DATA} field.
    loc = f"{sec}{sub}{lseq}"
else:
    # Non-parag data line - ignore:
    # BUG FIX: the original tested re.search(...) but then read a stale {m}
    # (None or a previous match) in the error message; capture the result.
    mm = re.search(r"([<][%$][>])", text)
    if mm is not None:
        data_error(nline, line, f"spurious alignment marker '{mm.group(1)}'")
    # BUG FIX: the original pattern r"([^-,.a-z?]" had an unbalanced '('
    # and would raise re.error at runtime.
    mm = re.search(r"([^-,.a-z?])", text)
    if mm is not None:
        data_error(nline, line, f"invalid char '{mm.group(1)}'")