# Last edited on 2026-02-20 14:29:52 by stolfi
# Outputs a list of all tuples {tsize} consecutive words,
# ignoring those that contain words that contain '*'.
#
# For each tuple of {tsize} consecutive words in the same line of the input,
# writes {tsize+1} lines in the output with the format
#
# "«{LEFT}» «{MIDDLE}» «{RIGHT}» <{SEC}.{NLIN}> {KW} {SL} {SM} {SR}"
#
# where {LEFT}, {MIDDLE}, and {RIGHT} are the words of the tuple; {SL}, {SM}, {SR}
# are the counts of words in each of these strings; {SEC} and {NLIN} specify the input line
# where the tuple occurs; and {KW} is the index of the tuple's first
# word in the input line.
#
# The strings {LEFT}, {MIDDLE}, and {RIGHT} consist of whole input
# words, separated by '.'. The string {MIDDLE} has at least one word, but
# {LEFT} and {RIGHT} may be empty.

# Select the text encoding {enc} from the book/unit combination.
# NOTE(review): source was whitespace-mangled; structure below reconstructed
# from the surviving tokens and comments — confirm against the original file.
if book == "bencao":
    assert sub == "fu"
    enc = "chu" if unit == "ch" else "pys" if unit == "ps" else None
elif book == "starps":
    assert sub == "fu" or sub == "gd"
    enc = "eva"
else:
    assert False, f"bad {book = }"
assert enc is not None, f"bad combo {book = } {unit = }"

m = re.match(pat_punc, text)

if enc == "utf":
    # Cleanup consists in deleting the Chinese punctuation:
    for ch in text:
        if debug: err.write(f"!! ch = '{ch}'")
        # BUG FIX: arguments were swapped — re.fullmatch takes (pattern, string).
        if re.fullmatch(pat_punc, ch):
            num_ignored += 1
            if debug: err.write(" KO")
        else:
            good_chars.append(ch)
            if debug: err.write(" OK")
        if debug: err.write("\n")
    text = "".join(good_chars)
elif enc == "eva":
    # Cleanup consists of deleting parag markers and ensuring simple EVA.
    if unit == "ec":
        # Remove all EVA punctuation:
        tlen = len(text)
        text = re.sub(r"[-,.]", "", text)
        num_ignored += tlen - len(text)
    elif unit == "wc":
        # Normalize all punc to single '.':
        tlen = len(text)
        text = re.sub(r"[-,]", ".", text)
        # Normalize punctuation:
        text = re.sub(r"[.][.]+", ".", text)
        text = re.sub(r"^[.]+", "", text)
        text = re.sub(r"[.]+$", "", text)
        num_ignored += tlen - len(text)
    else:
        assert False, f"invalid combo {enc = } {unit = }"

# Read tables of chinese character sets:
charset = None  # Sets of special hanzi characters (punct, blank, etc.)
if unit == "ch":
    charset = dict()
    set_dir = "langbank/chin"
    charset['invalid'] = read_chinese_char_set(f"{set_dir}/utf8-invalid.tbl")
    charset['bullets'] = read_chinese_char_set(f"{set_dir}/utf8-bullets.tbl")
    charset['symbol'] = read_chinese_char_set(f"{set_dir}/utf8-symbol.tbl")
    charset['punct'] = read_chinese_char_set(f"{set_dir}/utf8-punct.tbl")
    charset['blank'] = read_chinese_char_set(f"{set_dir}/utf8-blank.tbl")

# Patterns for parsing the input lines:
pat_line = None  # Matches a pinyin line, with groups {LOC} and {TEXT}.
pat_punc = None  # Matches pinyin punctuation (excluding blanks).
pat_word = None  # Matches a pinyin word (syllable or compound).
if unit == "ch":
    # NOTE(review): this branch body was lost in the source mangling —
    # the hanzi-specific patterns must be restored from the original file.
    pass
elif unit == "ps" or unit == "pj":
    # Patterns for parsing pinyin:
    pat_loc = r"b[1-3][.][1-6][.][0-9][0-9][0-9]"
    pat_line = f"<({pat_loc})>[ ]+(.*)\n"
    pat_punc = r"[.,;()*]"
else:
    assert False

pat_sec = r"s[0-2]"               # Section s-number, "s0" to "s2".
pat_sub = r"[.][0-9]"             # Subsection number, 0 to 9, with '.'.
pat_lseq = r"[.][0-9][0-9][0-9]"  # , with '.'.
pat_locid = f"<({pat_sec})({pat_sub})({pat_lseq})>"

# NOTE(review): the 'else:' below was orphaned in the mangled source; the
# enclosing 'if m is not None:' is reconstructed from the adjacent comments
# ("Is a data line" vs "Non-parag data line") — confirm against the original.
if m is not None:
    # Is a data line:
    if m.lastindex != 4: prog_error("num fields = %d" % m.lastindex)
    sec = m.group(1)
    sub = m.group(2)
    lseq = m.group(3)
    text = m.group(4).strip()  # {DATA} field.
    loc = f"{sec}{sub}{lseq}"
else:
    # Non-parag data line - ignore:
    # BUG FIX: the original tested re.search(...) but then read a stale {m}
    # (None or a previous match) in the error message; capture the result.
    mm = re.search(r"([<][%$][>])", text)
    if mm is not None:
        data_error(nline, line, f"spurious alignment marker '{mm.group(1)}'")
    # BUG FIX: the original pattern r"([^-,.a-z?]" had an unbalanced '('
    # and would raise re.error at runtime.
    mm = re.search(r"([^-,.a-z?])", text)
    if mm is not None:
        data_error(nline, line, f"invalid char '{mm.group(1)}'")