#! /bin/python3
# Last edited on 2026-04-28 13:09:48 by stolfi

# Functions for aligning micro-parsed SBJ entries with SPS word text.

import sys, os, re
from sys import stderr as err
from process_funcs import bash, basic_line_loop
from error_funcs import arg_error, file_line_error, prog_error
from chinese_funcs import read_chinese_char_set
import size_position_funcs as spf
import write_parsing_funcs as wpf
from math import sqrt, hypot, exp, log, pi, inf, nan, floor, ceil

def align_text_wc_with_micro_parsing_tg_wh_en_and_macro_parsing_ec \
  ( items_tg, items_wh, items_en, segs_ch, text_wc, segs_ec ):
  # Aligns the word-separated EVA text of an SPS parag {P} with the
  # micro-parsing of the SBJ entry {E} that {P} is supposed to translate.
  #
  # Inputs:
  #
  #   {items_tg}  a list of tag strings, one per item of {items_wh}.
  #
  #   {items_wh}  a micro-parsing of {E} into "items": strings of hanzi
  #     possibly interspersed with ideographic punctuation and
  #     ideographic spaces.
  #
  #   {items_en}  the English translations of the items, parallel to
  #     {items_wh}.
  #
  #   {segs_ch}  a macro-parsing of {E} into alternating gaps and
  #     keyword hits: an odd number {ns} of strings that must contain
  #     only hanzi, without punctuation.
  #
  #   {text_wc}  the text of {P}, with words delimited by the EVA
  #     separators [.,-].
  #
  #   {segs_ec}  the macro-parsing of {P} that is supposed to match
  #     {segs_ch}: {ns} strings of EVA letters [a-z?] without separators
  #     or other markup, whose concatenation must equal the letters of
  #     {text_wc}.
  #
  # The concatenation of {segs_ch} must be a reduced version of the
  # hanzi of {items_wh}: items of {E} that are assumed to be missing
  # from the translation {P} are simply absent from {segs_ch}, and each
  # item must be either wholly present or wholly absent.
  #
  # Processing steps:
  #
  #   1. {remove_omitted_bencao_items} drops from {items_wh}, {items_tg}
  #      and {items_en} the items that have no counterpart in {segs_ch}.
  #
  #   2. {split_micro_parsings_tg_wh_en_per_macro_parsing_ch} distributes
  #      the remaining items (or pieces of them) over {ns} lists
  #      {bites_wh_segs[0..ns-1]}, one per segment of {segs_ch}, with
  #      parallel lists for the tags and the translations.
  #
  #   3. {split_text_wc_per_macro_parsing_ec} cuts {text_wc} into {ns}
  #      strings {segs_wc[0..ns-1]} that carry the same EVA letters as
  #      the corresponding strings of {segs_ec}.
  #
  #   4. {split_segment_wc_proportionally_to_micro_parsing_wh} cuts each
  #      {segs_wc[ks]} at word separators into as many pieces as there
  #      are bites in {bites_wh_segs[ks]}, with letter counts roughly
  #      proportional to the hanzi counts of those bites.
  #
  #   5. The per-segment lists are concatenated into four flat parallel
  #      lists, and {copy_bite_indentation} is applied to every EVA and
  #      English bite, using the matching hanzi bite as the model.
  #
  # Returns the four parallel lists {bites_tg, bites_wh, bites_wc, bites_en},
  # where {bites_wh[ib]} is an item of {items_wh} or a piece thereof,
  # {bites_tg[ib]} is its tag, {bites_wc[ib]} is the approximate EVA
  # word or words matched to it, and {bites_en[ib]} is its English
  # translation.

  debug = False

  num_segs = len(segs_ch)
  num_hits = num_segs//2
  num_gaps = num_hits + 1
  assert num_segs == num_gaps + num_hits
  assert len(segs_ec) == num_segs

  def flatten_per_seg_lists(lists_per_seg):
    # Concatenates the {num_segs} lists {lists_per_seg[0..num_segs-1]}
    # into a single flat list.
    assert len(lists_per_seg) == num_segs
    flat = []
    for seg_list in lists_per_seg:
      flat.extend(seg_list)
    return flat
    # ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

  def indent_bites_like_wh(ref_wh, target):
    # Replaces, in place, each {target[ib]} by the result of
    # {copy_bite_indentation(ref_wh[ib], target[ib])}.
    count = len(ref_wh)
    assert len(target) == count
    for ib, bwh in enumerate(ref_wh):
      target[ib] = copy_bite_indentation(bwh, target[ib])
    return
    # ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

  if debug:
    err.write(f"!$ ### input micro-parsing tg, wh, en ###\n")
    write_tg_wh_wc_en_micro_parsings(err, "!$", items_tg, items_wh, None, items_en)
    err.write(f"!$ ### input macro-parsing ch, ec ###\n")
    write_wh_ec_wc_macro_parsings(err, "!$", segs_ch, segs_ec, None)
    err.write(f"!$ ### input parag text wc ###\n")
    err.write(f"!$ {text_wc = }\n")
    err.write(f"!$ removing omitted items ...\n")

  # Step 1: discard the items that are absent from the macro-parsing.
  items_wh, items_tg, items_en = \
    remove_omitted_bencao_items(items_wh, items_tg, items_en, segs_ch)

  if debug:
    err.write(f"!$ after removal:\n")
    write_tg_wh_wc_en_micro_parsings(err, "!$", items_tg, items_wh, None, items_en)

  num_items = len(items_wh)
  assert len(items_en) == num_items

  if debug:
    err.write(f"### input segs: ###\n")
    write_wh_ec_wc_macro_parsings(err, "!$", segs_ch, segs_ec, None)
    err.write(f"!$ Splitting items per hanzi segs ...\n")

  # Step 2: distribute the items over the hanzi segments.
  split_result = split_micro_parsings_tg_wh_en_per_macro_parsing_ch \
    ( items_wh, items_tg, items_en, segs_ch )
  bites_wh_segs, bites_tg_segs, bites_en_segs = split_result
  if debug:
    err.write(f"!$ Split items:\n")
    write_tg_wh_wc_en_macro_micro_parsings \
      ( err, '!$', bites_tg_segs, bites_wh_segs, None, bites_en_segs )
  assert len(bites_tg_segs) == num_segs
  assert len(bites_wh_segs) == num_segs
  assert len(bites_en_segs) == num_segs

  # Step 3: cut the EVA text into the same number of segments.
  if debug: err.write(f"!$ Splitting the puntuacted starps text as per macro EVA parsing ...\n")
  segs_wc = split_text_wc_per_macro_parsing_ec(text_wc, segs_ec)
  if debug:
    err.write(f"!$ split starps text:\n")
    write_wh_ec_wc_macro_parsings(err, "!$", None, segs_ec, segs_wc)
  assert len(segs_wc) == num_segs

  # Step 4: cut each EVA segment into one piece per hanzi bite.
  if debug: err.write(f"!$ Splitting the starps punctuated EVA segs as per items ...\n")
  bites_wc_segs = []
  for seg_wc, seg_bites_wh in zip(segs_wc, bites_wh_segs):
    if debug: err.write(f"!$ wc segment = {seg_wc!r} wh segment = {seg_bites_wh!r}\n")
    pieces_wc = split_segment_wc_proportionally_to_micro_parsing_wh(seg_wc, seg_bites_wh)
    if debug: err.write(f"!$ bts_wc = {pieces_wc!r}\n\n")
    assert len(pieces_wc) == len(seg_bites_wh)
    bites_wc_segs.append(pieces_wc)

  if debug:
    write_tg_wh_wc_en_macro_micro_parsings \
      ( err, '!$', bites_tg_segs, bites_wh_segs, bites_wc_segs, bites_en_segs )
    err.write(f"!$ Joining lists of bites ...\n")

  # Step 5: flatten the per-segment lists and fix the indentation.
  bites_tg = flatten_per_seg_lists(bites_tg_segs)
  bites_wh = flatten_per_seg_lists(bites_wh_segs)
  bites_wc = flatten_per_seg_lists(bites_wc_segs)
  bites_en = flatten_per_seg_lists(bites_en_segs)

  if debug:
    err.write(f"!$ joined lists:\n")
    write_tg_wh_wc_en_micro_parsings(err, '!$', bites_tg, bites_wh, bites_wc, bites_en)

  num_bites = len(bites_wh)
  assert len(bites_tg) == num_bites
  assert len(bites_wc) == num_bites
  assert len(bites_en) == num_bites

  if debug: err.write(f"!$ copying indentation ...\n")
  indent_bites_like_wh(bites_wh, bites_wc)
  indent_bites_like_wh(bites_wh, bites_en)

  if debug:
    err.write(f"!$ final items lists:\n")
    write_tg_wh_wc_en_micro_parsings(err, '!$', bites_tg, bites_wh, bites_wc, bites_en)

  return bites_tg, bites_wh, bites_wc, bites_en
  # ----------------------------------------------------------------------
      
def copy_bite_indentation(bwh, baa):
  # Returns {baa} indented like {bwh}: one '··' is prepended to {baa}
  # for each leading ideographic space '　' of {bwh}.
  #
  # An empty {baa} is replaced by the placeholder "-", and a {baa}
  # that is just "-" is returned as is, without any indentation.
  if baa in ("", "-"): return "-"
  depth = 0
  for ch in bwh:
    if ch != '　': break
    depth += 1
  if depth == 0: return baa
  return '··' * depth + baa
  # ----------------------------------------------------------------------

def remove_omitted_bencao_items(items_wh, items_tg, items_en, segs_ch):
  # Filters the parallel lists {items_wh, items_tg, items_en}, keeping
  # only the entries whose hanzi are present in the macro-parsing
  # {segs_ch}.
  #
  # Each string of {items_wh} should consist of hanzi and hanzi
  # punctuation; the strings of {items_tg} and {items_en} are arbitrary.
  # The strings of {segs_ch} should consist of hanzi only.
  #
  # Let {text_ch} be the concatenation of {segs_ch} (stripped of any
  # punctuation, just in case). The items are scanned in order with a
  # finger {kch} into {text_ch}. An item is kept, and the finger
  # advanced, when its hanzi (punctuation removed) are exactly the next
  # characters of {text_ch}; otherwise the item is dropped from the
  # three lists. Thus each item is assumed to be either wholly present
  # or wholly absent. Note that an item without any hanzi always
  # matches (the empty string) and is therefore kept.
  #
  # Fails with an assertion error, after reporting the leftover on
  # {stderr}, if the scan does not consume all of {text_ch}.
  #
  # Returns the filtered versions of {items_wh, items_tg, items_en}.

  debug = False

  assert len(items_en) == len(items_wh)

  punct_pat = r"[，。；：［］（）　～]"
  text_ch = re.sub(punct_pat, "", "".join(segs_ch))
  if debug: err.write(f"!! {text_ch = !r}\n")

  items_wh_new = []
  kept_tg = []
  kept_en = []
  kch = 0  # Finger into {text_ch}.
  for it, item_wh in enumerate(items_wh):
    item_ch = re.sub(punct_pat, "", item_wh)
    mch = len(item_ch)
    tbit_ch = text_ch[kch:kch+mch]
    if debug: err.write(f"!! {item_wh = !r} {item_ch = !r} {kch = } {tbit_ch = !r}\n")
    if item_ch != tbit_ch:
      # The item is not at the expected place of {text_ch}: omit it.
      if debug: err.write(f"!! omitted {kch = }\n")
      continue
    items_wh_new.append(item_wh)
    kept_tg.append(items_tg[it])
    kept_en.append(items_en[it])
    kch += mch
    if debug: err.write(f"!! appended {kch = }\n")

  if kch != len(text_ch):
    err.write(f"!! {items_wh_new = !r}\n")
    err.write(f"!! leftover = {text_ch[kch:]}\n")
    assert False, "segs_ch not contained in items_wh"
  return items_wh_new, kept_tg, kept_en
  # ----------------------------------------------------------------------

def split_micro_parsings_tg_wh_en_per_macro_parsing_ch(items_wh, items_tg, items_en, segs_ch):
  # Distributes the items of a micro-parsing over the segments of a
  # macro-parsing of the same hanzi text.
  #
  # Parameters:
  #
  #   {items_wh}  a micro-parsing of a punctuated SBJ entry {E} into
  #     {nt} items: strings of hanzi possibly interspersed with
  #     ideographic punctuation.
  #
  #   {items_tg}  a list of {nt} tags, one per item of {items_wh}.
  #
  #   {items_en}  a list of {nt} English translations, one per item of
  #     {items_wh}.
  #
  #   {segs_ch}  a macro-parsing of the pure hanzi text of {E} into
  #     {ns} alternating gaps and hits ({ns} must be odd). Each string
  #     must have only hanzi, and their concatenation must be equal to
  #     the concatenation of the items of {items_wh} with the
  #     punctuation removed.
  #
  # For each segment {segs_ch[ks]}, the procedure calls
  # {gobble_up_bites_of_items} to collect a list {bites_wh_segs[ks]} of
  # items, or pieces of items ("bites"), that have the same hanzi as
  # that segment, together with parallel lists {bites_tg_segs[ks]} and
  # {bites_en_segs[ks]} taken from {items_tg} and {items_en}.
  #
  # An item that straddles a segment boundary is broken in two or more
  # bites. The break is marked with the hyphenation mark '～' at the
  # end of the bite before the cut and at the start of the bite after
  # it. The leading ideographic spaces '　' (indentation) of the bite
  # before the cut are then replicated on the bite after it. This
  # indentation fix is applied only to the hanzi bites.
  #
  # Fails with an assertion error if, after processing all segments,
  # some item or part of an item of {items_wh} remains unused.
  #
  # Returns the three lists of lists {bites_wh_segs[0..ns-1]},
  # {bites_tg_segs[0..ns-1]}, {bites_en_segs[0..ns-1]}.

  debug = False

  ns = len(segs_ch)
  nh = ns//2
  ng = nh + 1
  assert ns == ng + nh

  nt = len(items_wh)
  assert len(items_tg) == nt
  assert len(items_en) == nt

  punct_wh = set("，。；：　～")
  brack_wh = set("［］（）")
  hyphen_wh = '～'

  def copy_indent_at_split(bits, kb, hyphen):
    # Takes a list of lists of strings {bits}. If the last string of
    # {bits[kb]} ends with {hyphen} and the first string of
    # {bits[kb + 1]} starts with {hyphen}, prepends to the latter as
    # many '　' as there are at the start of the former.
    # Does nothing if either list or either string is empty.
    if debug: err.write(f"!# {bits = !r} {kb = }\n")
    if kb < 0 or kb + 1 >= len(bits): return
    bs0 = bits[kb]
    bs1 = bits[kb + 1]
    if debug:
      err.write(f"!# {bs0 = !r}\n")
      err.write(f"!# {bs1 = !r}\n")
    assert isinstance(bs0, list) and isinstance(bs1, list)
    if len(bs0) == 0 or len(bs1) == 0: return
    b0 = bs0[-1]
    b1 = bs1[0]
    if debug:
      err.write(f"!# {b0 = !r}\n")
      err.write(f"!# {b1 = !r}\n")
    assert isinstance(b0, str) and isinstance(b1, str)
    if b0 == "" or b1 == "": return
    split = (b0[-1] == hyphen)
    # The two sides of a cut must agree on whether there was a split:
    assert split == (b1[0] == hyphen)
    if not split: return
    indent = len(b0) - len(b0.lstrip('　'))
    b1 = '　' * indent + b1
    bs1[0] = b1
    if debug:
      err.write(f"!# {b1 = !r}\n")
      err.write(f"!# new {bs1 = !r}\n")
      err.write(f"!# new {bits = !r}\n")
      err.write(f"!# ----------------------------------------\n")
    return
    # :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

  bites_wh_segs = []
  bites_tg_segs = []
  bites_en_segs = []
  # Unused tails of the item that was cut at the end of the previous segment:
  rest_wh = ""
  rest_tg = ""
  rest_en = ""
  it = 0  # The next untouched item is {items_wh[it]}.
  for seg_ch in segs_ch:
    # Collect the bites of items that match {seg_ch}:
    gobbled = gobble_up_bites_of_items \
      ( rest_wh, items_wh, rest_tg, items_tg, rest_en, items_en,
        it, punct_wh, brack_wh, seg_ch, hyphen_wh
      )
    bts_wh, rest_wh, bts_tg, rest_tg, bts_en, rest_en, it = gobbled
    bites_wh_segs.append(bts_wh)
    bites_tg_segs.append(bts_tg)
    bites_en_segs.append(bts_en)

  # Fix the indentation after splits. The segments must be processed in
  # increasing order so that the indentation propagates along an item
  # that was cut more than once:
  for ks in range(ns):
    copy_indent_at_split(bites_wh_segs, ks, hyphen_wh)

  assert rest_wh == "" and it == nt
  return bites_wh_segs, bites_tg_segs, bites_en_segs
  # ----------------------------------------------------------------------

def split_text_wc_per_macro_parsing_ec(text_wc, segs_ec):
  # Given the text {text_wc} of an SPS parag {P} with word separators,
  # and a macro-parsing {segs_ec[0..ns-1]} of the EVA letters of {P}
  # into alternating gaps and hits, splits {text_wc} into strings
  # {segs_wc[0..ns-1]} with the same EVA letters.
  #
  # The number {ns} of segments must be odd. Each string of {segs_ec}
  # must have only EVA letters [a-z?], without word spaces or other
  # characters. The string {text_wc} must consist of those same EVA
  # letters, in the same order, possibly interspersed with the EVA word
  # separators [.,-].
  #
  # Before the split, the procedure removes from {text_wc} any locus
  # IDs like "<f1r.1>", any blanks and characters in [<$%>], and one
  # leading and one trailing separator; and turns every remaining
  # separator into '.'.
  #
  # Separators that follow the letters of a gap (even {ks}) are attached
  # to that gap; separators that follow the letters of a hit (odd {ks})
  # are left for the next gap.
  #
  # If the cut between {segs_wc[ks]} and {segs_wc[ks+1]} falls inside a
  # word, a hyphenation mark '~' is appended to the former and
  # prepended to the latter.
  #
  # Fails with an assertion error if the letters of {text_wc} and of
  # {segs_ec} do not match.
  #
  # Returns the list {segs_wc}.

  debug = False

  ns = len(segs_ec)
  nh = ns//2; ng = nh + 1; assert nh + ng == ns

  # Just in case:
  text_wc = re.sub(r"<[a-z0-9.]+>", "", text_wc)
  text_wc = re.sub(r"[ <$%>]", "", text_wc)

  # Normalize word separators:
  text_wc = re.sub(r"^[,.-]", "", text_wc)
  text_wc = re.sub(r"[,.-]$", "", text_wc)
  text_wc = re.sub(r"[,.-]", ".", text_wc)

  # Total count {nec} of EVA letters in {segs_ec}:
  nec = sum(len(sg_ec) for sg_ec in segs_ec)

  segs_wc = []
  rest_wc = text_wc # Leftover bit of {text_wc}
  kec = 0 # Index of next unmatched char in {segs_ec}.

  def grab_chars(tec, aggressive):
    # Removes from the front of {rest_wc} the letters that match {tec},
    # plus any '.' interspersed with them. If {aggressive}, also takes
    # the '.'s that follow the last matched letter. Returns the
    # removed string.
    nonlocal rest_wc, kec
    rest_ec = tec
    twc = ""
    while True:
      if rest_wc == "":
        assert rest_ec == "", f"unmatched gap/hit chars {rest_ec = !r}"
        break
      cwc = rest_wc[0];
      if cwc == '.':
        if rest_ec == "" and not aggressive: break
        if debug: err.write(f"!: {twc = !r} {rest_wc = !r}\n")
      else:
        if rest_ec == "": break
        cec = rest_ec[0]; rest_ec = rest_ec[1:]
        if debug: err.write(f"!: {cwc = !r} {cec = !r}\n")
        assert cwc == cec, f"wc/ec mismatch"
      twc += cwc; rest_wc = rest_wc[1:]
      if debug: err.write(f"!: {twc = !r} {rest_wc = !r}\n")
    kec += len(tec)
    return twc
    # ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

  for ks in range(ns):
    # Collect chars from {rest_wc} to match {segs_ec[ks]}.
    # Gaps (even {ks}) also swallow the separators that follow them:
    aggressive = (ks % 2 == 0)
    sg = grab_chars(segs_ec[ks], aggressive)
    # (The original debug line referenced an undefined name {rest}.)
    if debug: err.write(f"!: {sg = !r} {rest_wc = !r}\n")
    segs_wc.append(sg)

  assert rest_wc == "" and kec == nec

  # Add "hyphens" '~' between split words:
  for ks in range(ns-1):
    # Check the cut between segs {ks} and {ks+1}. An empty segment
    # counts as a word boundary:
    js = ks + 1
    char1 = '.' if segs_wc[ks] == "" else segs_wc[ks][-1]
    char2 = '.' if segs_wc[js] == "" else segs_wc[js][0]
    if char1 != '.' and char2 != '.':
      segs_wc[ks] = segs_wc[ks] + '~'
      segs_wc[js] = '~' + segs_wc[js]

  return segs_wc
  # ----------------------------------------------------------------------

def gobble_up_bites_of_items \
  ( rest_aa, items_aa, rest_bb, items_bb, rest_cc, items_cc, \
    it, punct, brack, pure_aa, hyphen
  ):
  # Scans the string {rest_aa} and then the strings {items_aa[it:]},
  # building a list {bites_aa} of items or pieces of items ("bites"),
  # until it has collected all the characters of the string {pure_aa}
  # and any punctuation characters that are interspersed with them.
  #
  # Characters of {rest_aa} and {items_aa[it:]} that are gobbled and
  # are not in the sets {punct} (punctuation) or {brack}
  # (parenthesis-like delimiters) must match the characters of
  # {pure_aa}, in order; otherwise the procedure fails with an
  # assertion error.
  #
  # After collecting all the characters of {pure_aa}, the procedure
  # keeps gobbling characters of {brack}, and also characters of
  # {punct} as long as the current bite is not empty. It stops at the
  # first character that does not qualify, or at the end of {items_aa}.
  #
  # If {rest_aa} is not empty on entry (the first bite does not start
  # at an item boundary), the first bite starts with {hyphen}. If the
  # scan stops in the middle of an item, {hyphen} is appended to the
  # last bite.
  #
  # If {items_bb} is not {None}, also builds a list {bites_bb},
  # parallel to {bites_aa}, from {rest_bb} and {items_bb}; otherwise
  # {bites_bb} is {None}. Ditto for {rest_cc}, {items_cc}, {bites_cc}.
  # The strings of {items_bb} and {items_cc} are never split.
  #
  # NOTE(review): the bb/cc text of an item is attached to the bite
  # that is open when the SECOND character of that item is gobbled. An
  # item of {items_aa} with a single character therefore seems to pass
  # its bb/cc text on to the following bite -- confirm whether
  # single-character items can occur.
  #
  # If the list {bites_aa} would be empty, it is set to a singleton
  # list with an empty string instead (and ditto for {bites_bb} and
  # {bites_cc}, when they are not {None}).
  #
  # Returns {bites_aa, rest_aa, bites_bb, rest_bb, bites_cc, rest_cc, it}
  # where {rest_aa, rest_bb, rest_cc, it} are the updated values of
  # those input parameters.

  debug = False

  has_bb = items_bb is not None
  has_cc = items_cc is not None

  nt = len(items_aa)
  np = len(pure_aa)

  def get_next_character_from_items_aa():
    # Peeks at the next character {char_aa} of {rest_aa}, or of
    # {items_aa[it]} if {rest_aa} is empty. Returns {char_aa} and the
    # values that {rest_aa} and {it} will have if {char_aa} is taken
    # (but does not update them).
    if rest_aa == "":
      assert it < nt, f"ran out of items_aa after {kp} of {np} chars"
      rest1_aa = items_aa[it]
      it1 = it + 1
    else:
      rest1_aa = rest_aa
      it1 = it
    assert isinstance(rest1_aa, str) and rest1_aa != ""
    char_aa = rest1_aa[0]; rest1_aa = rest1_aa[1:]
    return char_aa, rest1_aa, it1
    # ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

  if debug:
    err.write(f"!+ {rest_aa = !r} {items_aa = !r}\n")
    err.write(f"!+ {rest_bb = !r} {items_bb = !r}\n")
    err.write(f"!+ {rest_cc = !r} {items_cc = !r}\n")
    err.write(f"!+ {it = }\n")
    err.write(f"!+ {pure_aa = !r}\n")

  # Items of {items_aa, items_bb, items_cc} and chunks thereof that match {pure_aa}:
  bites_aa = []
  bites_bb = [] if has_bb else None
  bites_cc = [] if has_cc else None

  # Next incomplete strings to add to {bites_aa, bites_bb, bites_cc}.
  bt_aa = ""
  bt_bb = "" if has_bb else None
  bt_cc = "" if has_cc else None

  kp = 0  # Next character of {pure_aa} to be matched.

  # A non-empty {rest_aa} means that the previous call cut an item:
  if rest_aa != "": bt_aa = hyphen
  while rest_aa != "" or it < nt:
    if debug: err.write(f"!+ -- iteration --\n")
    if debug: err.write(f"!+ {rest_aa = !r} {it = } {rest_bb = !r} {rest_cc = !r} {it = }\n")
    if debug: err.write(f"!+ {bt_aa = !r} {bt_bb = !r} {bt_cc = !r}\n")
    if rest_aa == "" and bt_aa != "":
      # Item break, break {bites_aa, bites_bb, bites_cc} too:
      if debug: err.write(f"!+ item break ...\n")
      bites_aa.append(bt_aa); bt_aa = ""
      if has_bb: bites_bb.append(bt_bb); bt_bb = ""
      if has_cc: bites_cc.append(bt_cc); bt_cc = ""
      if debug: err.write(f"!+ {bites_aa = !r} {bites_bb = !r} {bites_cc = !r}\n")
      if debug: err.write(f"!+ {rest_aa = !r} {it = } {bt_aa = !r} {bt_bb = !r} {bt_cc = !r}\n")
    if debug: err.write(f"!+ getting next char ...\n")
    char_aa, rest1_aa, it1 = get_next_character_from_items_aa()
    if debug: err.write(f"!+ {char_aa = !r} {rest1_aa = !r} {it1 = }\n")
    # Take the character if {pure_aa} is not exhausted yet, or if it is
    # a bracket, or if it is punctuation trailing a non-empty bite:
    gobble = kp < np or char_aa in brack or (char_aa in punct and bt_aa != "")
    if debug: err.write(f"!+ {gobble = :1b} {kp = } pure_aa[kp] = { pure_aa[kp] if kp < np else '' !r}\n")
    if gobble:
      # Gobble {char_aa}:
      bt_aa = bt_aa + char_aa
      rest_aa = rest1_aa
      if char_aa not in punct and char_aa not in brack:
        pchar = pure_aa[kp]
        assert kp < np and char_aa == pchar, f"unexpected char_aa {char_aa} != {pchar} at {kp}"
        kp += 1
      # If a new item was started ({it1 != it}), its bb/cc text is
      # parked in {rest_bb,rest_cc}, to be moved to the current bite
      # when the next character is gobbled:
      if has_bb and bt_bb == "":
        bt_bb = rest_bb; rest_bb = "" if it1 == it else items_bb[it]
      if has_cc and bt_cc == "":
        bt_cc = rest_cc; rest_cc = "" if it1 == it else items_cc[it]
      it = it1
      if debug: err.write(f"!+ {rest_aa = !r} {rest_bb = !r} {rest_cc = !r} {it = }\n")
      if debug: err.write(f"!+ {bt_aa = !r} {bt_bb = !r} {bt_cc = !r}\n")
    else:
      # Stop here, leave {char_aa} for the next hit:
      if rest_aa != "":
        bt_aa = bt_aa + hyphen
      break
    if debug: err.write(f"!+ {kp = } {bt_aa = !r} {bt_bb = !r} {bt_cc = !r}\n")
    if debug: err.write(f"!+ {bt_aa = !r} {rest_aa = !r} {it = }\n")
  if debug: err.write(f"!+ -- end loop --\n")

  # Flush the last incomplete bite, if any. An absent ({None}) bb or cc
  # bite must not count as pending, otherwise a spurious empty bite
  # would be appended whenever {items_bb} or {items_cc} is {None}:
  pending_bb = has_bb and bt_bb != ''
  pending_cc = has_cc and bt_cc != ''
  if bt_aa != '' or pending_bb or pending_cc or len(bites_aa) == 0:
    bites_aa.append(bt_aa)
    if has_bb: bites_bb.append(bt_bb)
    if has_cc: bites_cc.append(bt_cc)

  return bites_aa, rest_aa, bites_bb, rest_bb, bites_cc, rest_cc, it
  # ----------------------------------------------------------------------

def split_segment_wc_proportionally_to_micro_parsing_wh(text_wc, bites_wh):
  # Takes a fragment {text_wc} of EVA text with words separated by EVA
  # word spaces [.,-], and a list {bites_wh[0..nb-1]} of strings with
  # hanzi characters and hanzi punctuation. Splits {text_wc} into a list
  # {bites_wc[0..nb-1]} of substrings at word boundaries so that, for
  # each {ib}, the total count of EVA letters in {bites_wc[0..ib]} is
  # approximately proportional to the total count of hanzi in
  # {bites_wh[0..ib]}.
  #
  # Word separators, blanks, and hyphenation marks '~' in {text_wc} are
  # not counted as EVA letters. Hanzi punctuation, ideographic spaces,
  # and hyphenation marks '～' in {bites_wh} are not counted as hanzi.
  #
  # The words of each returned bite are joined with '.'. Any words left
  # over after the last bite are appended to it. An empty bite is
  # returned as "-".
  #
  # Returns the list {bites_wc}.

  debug = False
  # debug = (text_wc == "apo")

  nb = len(bites_wh)

  hanzi_punct_pat = r"[，。；：［］（）　～]"
  text_ec = re.sub(r"[ .,~-]", "", text_wc); nec = len(text_ec)
  text_ch = re.sub(hanzi_punct_pat, "", "".join(bites_wh)); nch = len(text_ch)
  # The small offsets avoid a division by zero when there are no hanzi:
  ec_per_ch = (nec + 0.000001)/(nch + 0.000001)

  def get_next_ec_token(rest_wc):
    # Splits off the next non-empty EVA token from {rest_wc}.
    # The token will have only EVA letters [a-z?] or '~'.
    # Returns the token and the remaining {rest_wc}.
    # Returns "" as the token if there is none such.
    tok_ec = ""
    while rest_wc != "":
      m = re.fullmatch(r"([a-z?~]*)[.,-]+(.*)", rest_wc)
      if m is None:
        # No more separators: the rest is the last token.
        tok_ec = re.sub(r"[.,-]", "", rest_wc)
        rest_wc = ""
      else:
        tok_ec = m.group(1); rest_wc = m.group(2)
      if tok_ec != "": break
    return tok_ec, rest_wc
    # ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

  rest_wc = text_wc

  def gobble_wc_tokens(tch, tec):
    # Takes EVA words from {rest_wc} while that brings the cumulative
    # EVA letter count, which is {tec} on entry, closer to the count
    # expected for {tch} hanzi. Returns the bite {bwc} made of those
    # words and its count {mec} of EVA letters. Updates {rest_wc}.

    nonlocal rest_wc

    bwc = ""  # next bite for {bites_wc}.
    mec = 0   # Count of EVA letters in {bwc}.
    tec_est = tch * ec_per_ch

    while True:
      if debug:
        err.write(f"!¤ --- inner loop ---\n")
        err.write(f"!¤ {rest_wc = !r}\n")
        err.write(f"!¤ {bwc = !r} {mec = }\n")
      if debug: err.write(f"!¤ getting next token from wc text:\n")
      tok_ec, rest_wc = get_next_ec_token(rest_wc)
      if debug: err.write(f"!¤ {tok_ec = !r} {rest_wc = !r}\n")
      if tok_ec == "":
        assert rest_wc == ""; break
      else:
        # Shall we take {tok_ec} or end {bwc} here?
        # Count the letters of the token, excluding a hyphenation mark
        # at either end (the original discarded the leading-mark case):
        rec = len(re.sub(r"^~|~$", "", tok_ec))
        tec0 = tec + mec           # Next value of {tec} if we don't take this token.
        tec1 = tec + mec + rec     # Next value of {tec} if we take this token.
        take = abs(tec1 - tec_est) <= abs(tec0 - tec_est)
        if debug: err.write(f"!¤ {tec = } {mec = } {rec = }\n")
        if debug: err.write(f"!¤ {tec_est = } {tec0 = } {tec1 = } {take = :b}\n")
        if take:
          # Add {tok_ec} to current bite:
          if bwc != "": bwc += '.'
          bwc += tok_ec; mec += rec
        else:
          # Put {tok_ec} back and exit inner loop.
          if rest_wc != "": rest_wc = '.' + rest_wc
          rest_wc = tok_ec + rest_wc
          break
    return bwc, mec
    # ....................................................................

  bites_wc = []
  totct_ch = 0 # Count of hanzi so far in {bites_wh}.
  totct_ec = 0 # Count of EVA chars so far in {bites_wc}.

  for ib in range(nb):
    # Count hanzi in {bites_wh[ib]}:
    bwh = bites_wh[ib]
    bch = re.sub(hanzi_punct_pat, "", bwh)
    mch = len(bch)
    if debug:
      err.write(f"!¤ @@@ outer loop @@@\n")
      err.write(f"!¤ {totct_ch = } {bites_wc = !r}\n")
      err.write(f"!¤ {rest_wc = !r}\n")
      err.write(f"!¤ {bwh = !r} {bch = !r} {mch = }\n")
    totct_ch = totct_ch + mch
    bwc, mec = gobble_wc_tokens(totct_ch, totct_ec)
    bites_wc.append(bwc); totct_ec += mec

  if rest_wc != "":
    # Force gobbling of any final leftover into the last bite,
    # keeping a word separator between it and the previous words:
    if len(bites_wc) == 0:
      bites_wc.append(rest_wc)
    elif bites_wc[-1] == "":
      bites_wc[-1] = rest_wc
    else:
      bites_wc[-1] = bites_wc[-1] + '.' + rest_wc

  # Replace empty bites by "-":
  return [ "-" if bwc == "" else bwc for bwc in bites_wc ]
  # ----------------------------------------------------------------------
  
def write_wh_ec_wc_macro_parsings(wr, lab, segs_wh, segs_ec, segs_wc):
  # Writes to {wr} the three macro-parsings {segs_wh,segs_ec,segs_wc}
  # as a table, between two rulers of '~'. Every ruler line is prefixed
  # with {lab}, which is also passed on to the table writer.
  #
  # Assumes that the elements of {segs_wh} are hanzi strings, possibly
  # with ideographic punctuation, while those of {segs_ec} and
  # {segs_wc} are Latin strings. Callers in this module pass {None}
  # for a parsing that is not available yet.
  #
  # The table itself is written by {write_parsings} of the module
  # {write_parsing_funcs}, which is imported as {wpf} (the original
  # code used the undefined name {wfn}). Each parsing is given to it
  # as a triple; presumably the second and third elements are a
  # placeholder string and a padding character -- TODO confirm against
  # {write_parsing_funcs}.
  wr.write(f"{lab} ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
  mpars = ( \
      ( segs_wh, "？", '　', ), 
      ( segs_ec, "??", ' ', ),
      ( segs_wc, "??", ' ', ),
    )
  wpf.write_parsings(wr, lab, True, mpars)
  wr.write(f"{lab} ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
  return
  # ----------------------------------------------------------------------
  
def write_tg_wh_wc_en_micro_parsings(wr, lab, bites_tg, bites_wh, bites_wc, bites_en):
  # Writes to {wr} the four micro-parsings
  # {bites_tg,bites_wh,bites_wc,bites_en} as a table, between two
  # rulers of '~'. Every ruler line is prefixed with {lab}, which is
  # also passed on to the table writer.
  #
  # Assumes that the elements of {bites_wh} are hanzi strings, possibly
  # with ideographic punctuation, while the others are Latin or pinyin
  # strings in Unicode. Callers in this module pass {None} for a
  # parsing that is not available yet.
  #
  # The table itself is written by {write_parsings} of the module
  # {write_parsing_funcs}, which is imported as {wpf} (the original
  # code used the undefined name {wfn}). Each parsing is given to it
  # as a triple; presumably the second and third elements are a
  # placeholder string and a padding character -- TODO confirm against
  # {write_parsing_funcs}.
  wr.write(f"{lab} ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
  mpars = ( \
    ( bites_tg, "??", ' ' ),
    ( bites_wh, "？", '　' ),
    ( bites_wc, "??", ' ' ),
    ( bites_en, "??", ' ' ),
  )
  wpf.write_parsings(wr, lab, False, mpars)
  wr.write(f"{lab} ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
  return
  # ----------------------------------------------------------------------

def write_tg_wh_wc_en_macro_micro_parsings \
  ( wr, lab, segs_bites_tg, segs_bites_wh, segs_bites_wc, segs_bites_en ):
  # Writes to {wr} four two-level parsings (lists of lists of bites,
  # one inner list per macro-parsing segment), between two rulers of
  # '+'. Every ruler line is prefixed with {lab}, which is also passed
  # on to the table writer. Callers in this module pass {None} for a
  # parsing that is not available yet.
  #
  # The table itself is written by {write_bilevel_parsings} of the
  # module {write_parsing_funcs}, which is imported as {wpf} (the
  # original code used the undefined name {wfn}). Each parsing is
  # given to it as a triple; presumably the second and third elements
  # are a placeholder string and a padding character -- TODO confirm
  # against {write_parsing_funcs}.
  wr.write(f"{lab} ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n")
  mpars = ( \
      ( segs_bites_tg, "",   ' ', ),
      ( segs_bites_wh, "？", ' ', ),
      ( segs_bites_wc, "??", ' ', ),
      ( segs_bites_en, "??", ' ', ),
    )
  wpf.write_bilevel_parsings(wr, lab, mpars)
  
  wr.write(f"{lab} ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n")
  return
  # ----------------------------------------------------------------------

def test_stuff():
  # Runs the enabled tests of this module, writing a banner and
  # ruler lines to {stderr} around them.
  ruler = "----------------------------------------\n"

  err.write("TESTING\n")

  err.write(ruler)
  test_split_segment_wc_proportionally_to_micro_parsing_wh()
  # Currently disabled tests (each was preceded by a ruler line):
  #   test_align_text_wc_with_micro_parsing_tg_wh_en_and_macro_parsing_ec()
  #   test_split_text_wc_per_macro_parsing_ec()
  #   test_gobble_up_bites_of_items()
  #   test_split_micro_parsings_tg_wh_en_per_macro_parsing_ch()
  err.write(ruler)

  return
  # ---------------------------------------------------------------------- 

def test_split_segment_wc_proportionally_to_micro_parsing_wh():
  # Runs {do_test_split_segment_wc_proportionally_to_micro_parsing_wh}
  # on a fixed list of cases.  Each case is a triple: the EVA text with
  # word separators, the list of hanzi bites, and the list of EVA bites
  # that the split is expected to produce.
  cases = (
    ( 'rair.apy.okeey.qokaiin.or.aiir.al.dal.sheeo',
      [ '　　补虚，　　　　　', '　　杀毒，　　　　　', '　　　辟～　　　　　' ],
      [ 'rair.apy.okeey', 'qokaiin.or.aiir.al', 'dal.sheeo', ],
    ),
    ( 'chedy.qokeed.okain.chdy.laiin.ofar.chedy.tedam.okeedy.lkal.daiin.yk~',
      [ '风寒湿痹，　', '　　乳难，　', '　　消水，', '　　养五脏，　　', '　　益～' ],
      [ 'chedy.qokeed.okain.chdy', 'laiin.ofar', 'chedy.tedam', 'okeedy.lkal', 'daiin.yk~'  ],
    ),
  )
  for case_wc, case_wh, case_exp_wc in cases:
    do_test_split_segment_wc_proportionally_to_micro_parsing_wh(case_wc, case_wh, case_exp_wc)
  return
  # ----------------------------------------------------------------------

def do_test_split_segment_wc_proportionally_to_micro_parsing_wh(text_wc, bites_wh, exp_bites_wc):
  # Tests {split_segment_wc_proportionally_to_micro_parsing_wh} on the
  # EVA text {text_wc} and the hanzi bites {bites_wh}, tracing inputs
  # and results to {err}.  Fails with an assertion error unless
  #
  #   (1) the resulting bites, joined with ".", reproduce {text_wc}
  #     apart from differences in the separators (see {tidy} below); and
  #
  #   (2) the resulting bites are equal to {exp_bites_wc}.

  def tidy(tx):
    # Replaces each run of two or more of the separators ",.-" by a
    # single ".", then removes any separators at either end of {tx}.
    tx = re.sub(r"[,.-][,.-]+", ".", tx)
    tx = re.sub(r"^[,.-]+", "", tx)
    return re.sub(r"[,.-]+$", "", tx)

  say = err.write
  say("=== testing split_segment_wc_proportionally_to_micro_parsing_wh ===\n")

  say("### input items ###\n")
  say(f"!% text_wc = {text_wc!r}\n")
  say(f"!% bites_wh = {bites_wh!r}\n")

  say("!% splitting words proportionally to hanzi bites...\n")
  got_wc = split_segment_wc_proportionally_to_micro_parsing_wh(text_wc, bites_wh)

  say("!% split words:\n")
  say(f"!% bites_wc = {got_wc!r}\n")

  # Compare the input text and the re-joined bites after tidying both:
  want_tx = tidy(text_wc)
  have_tx = tidy(".".join(got_wc) + '.')

  say(f"!% text_wc = {want_tx!r}\n")
  say(f"!% tchk_wc = {have_tx!r}\n")
  assert want_tx == have_tx

  assert got_wc == exp_bites_wc
  return
  # ----------------------------------------------------------------------

def test_align_text_wc_with_micro_parsing_tg_wh_en_and_macro_parsing_ec():
  # Exercises {align_text_wc_with_micro_parsing_tg_wh_en_and_macro_parsing_ec}
  # on a hand-made entry: a table of (tag, hanzi item, English item)
  # triples, a macro-parsing {segs_ch} of the hanzi, an EVA text
  # {text_wc}, and its macro-parsing {segs_ec} with the separators
  # ",.~-" removed.  Inputs and results are traced to {err}.
  #
  # The only check is that the four returned lists have the same length.

  err.write("=== testing align_text_wc_with_micro_parsing_tg_wh_en_and_macro_parsing_ec ===\n")

  table = (
      ( '(A)    ', '丹雄鸡　　　　', 'Red male chicken',         ), 
      ( '(A1)   ', '　［味］　　　', '[Flavor]',                 ), 
      ( '(A11)  ', '　　甘，　　　', 'sweet,',                   ), 
      ( '(A12)  ', '　　微温。　　', 'slightly warm.',           ), 
      ( '(A3)   ', '　［主治］　　', '[Main uses]',              ), 
      ( '(A31)  ', '　　（女子）　', '(Women)',                  ), 
      ( '(A311) ', '　　　赤白沃；', 'red and white discharge;', ), 
      ( '(A32)  ', '　　补虚，　　', 'bamboo hair,',             ), 
      ( '(A33)  ', '　　杀毒，　　', 'kills bears,',             ), 
      ( '(A34)  ', '　　（女子）　', '(women)',                  ), 
      ( '(A341) ', '　　　辟不祥。', 'cannot knit socks.',       ), 
      ( '(B)    ', '头：　　　　　', 'Head:',                    ), 
      ( '(B3)   ', '　［主］　　　', '[Mainly for]',             ), 
      ( '(B31)  ', '　　杀鬼。　　', 'killing time.',            ), 
      ( '(C)    ', '肪：　　　　　', 'Transistors:',             ), 
    )
  # Transpose the table into its three columns:
  items_tg, items_wh, items_en = ( list(col) for col in zip(*table) )

  err.write("### input items ###\n")
  write_tg_wh_wc_en_micro_parsings(err, '!!', items_tg, items_wh, None, items_en)

  segs_ch = [ '丹雄鸡', '主治', '赤白', '沃', '补虚杀毒辟不祥头', '主', '杀鬼肪', ]

  text_wc =   'poar.keeo.daiin.qotedair.apo.rair.apy.okeey.qokaiin.or.aiir.al.dal.sheeo.daiin.chsd.qokeeey'

  # The macro-parsing of the EVA text, first with separators for
  # legibility, then with the separators deleted:
  segs_ec_dotted = ( 'poar.keeo', 'daiin', 'qotedair', 'apo', 'rair.apy.okeey.qokaiin.or.aiir.al.dal.sheeo', 'daiin',  'chsd.qokeeey' )
  sep_pat = re.compile(r"[,.~-]")
  segs_ec = [ sep_pat.sub("", sg) for sg in segs_ec_dotted ]

  err.write("### input segs ###\n")
  write_wh_ec_wc_macro_parsings(err, '!!', segs_ch, segs_ec, None)

  err.write("!! splitting and aligning...\n")
  result = align_text_wc_with_micro_parsing_tg_wh_en_and_macro_parsing_ec \
    ( items_tg, items_wh, items_en, segs_ch, text_wc, segs_ec )
  bites_tg, bites_wh, bites_wc, bites_en = result

  write_tg_wh_wc_en_micro_parsings(err, '!!', bites_tg, bites_wh, bites_wc, bites_en)

  # The four output lists must be parallel:
  nb = len(bites_tg)
  for bites_xx in ( bites_wh, bites_wc, bites_en ):
    assert nb == len(bites_xx)

  # html_tb = h77.make_three_column_entry_table(st, tags1, bites_wh, bites_wc, bites_en)
  return

def test_split_micro_parsings_tg_wh_en_per_macro_parsing_ch():
  # Exercises {split_micro_parsings_tg_wh_en_per_macro_parsing_ch} on a
  # hand-made entry given as a table of (tag, hanzi item, English item)
  # triples and a macro-parsing {segs_ch} of its hanzi.  Inputs and
  # results are traced to {err}.
  #
  # Checks that {segs_ch} concatenates to the expected hanzi string,
  # that it has an odd number of segments, and that the hanzi bites
  # returned for each segment, with ideographic punctuation, brackets,
  # and spaces removed, concatenate to that segment.

  err.write("=== testing split_micro_parsings_tg_wh_en_per_macro_parsing_ch ===\n")
  table = (
      ( '(A)',     '丹雄鸡　　　　', 'Red rooster:',     ),
      ( '(A1)',    '　［味］　　　', '[Flavor]',         ),
      ( '(A11)',   '　　甘，　　　', 'sweet,',           ),
      ( '(A12)',   '　　微温。　　', 'a bit fuzzy.',     ),
      ( '(A3)',    '　［主治］　　', '[Main uses]',      ),
      ( '(A31)',   '　　（女子）　', '(Women)',          ),
      ( '(A311)',  '　　　赤白沃。', 'lipstick.',        ),
      ( '(A32)',   '　　补虚，　　', 'sawing wood,',     ),
      ( '(A33)',   '　　杀毒，　　', 'fixing cars,',     ),
      ( '(A34)',   '　　辟不祥。　', 'washing floors.',  ),
      ( '(B)',     '头：　　　　　', 'Head:',            ),
      ( '(B3)',    '　［主］　　　', '[Mainly for]',     ),
      ( '(B31)',   '　　杀鬼。　　', 'killing time.',    ),
      ( '(C)',     '肪：　　　　　', 'Pipes:',           ),
    )
  # Transpose the table into its three columns:
  items_tg, items_wh, items_en = ( list(col) for col in zip(*table) )
  err.write("### input micro-parsings ###\n")
  write_tg_wh_wc_en_micro_parsings(err, '!=', items_tg, items_wh, None, items_en)

  segs_ch = [ '丹雄鸡味甘微温', '主治', '女子赤白', '沃', '补虚杀毒辟', '不祥头主', '杀鬼肪', ]
  err.write("### input macro-parsing of hanzi ###\n")
  write_wh_ec_wc_macro_parsings(err, '!=', segs_ch, None, None)

  # Sanity check of the test data itself:
  joined_ch = "".join(segs_ch)
  err.write(f"chicktx_ch = {joined_ch!r}\n")
  assert '丹雄鸡味甘微温主治女子赤白沃补虚杀毒辟不祥头主杀鬼肪' == joined_ch

  # The segments alternate between gaps and hits, starting and
  # ending with a gap, so their number must be odd:
  ns = len(segs_ch)
  nh = ns // 2
  ng = nh + 1
  assert ns == ng + nh

  result = split_micro_parsings_tg_wh_en_per_macro_parsing_ch \
    ( items_wh, items_tg, items_en, segs_ch )
  segs_bites_wh, segs_bites_tg, segs_bites_en = result

  write_tg_wh_wc_en_macro_micro_parsings \
    ( err, '!=', segs_bites_tg, segs_bites_wh, None, segs_bites_en )

  # Each segment must get back exactly its own hanzi:
  for ks, sg_ch in enumerate(segs_ch):
    ck_ch = re.sub(r"[，。；：［］（）　～]", "", "".join(segs_bites_wh[ks]))
    err.write(f"!= sg_ch = {sg_ch!r} ck_ch = {ck_ch!r}\n")
    assert sg_ch == ck_ch
  return
  # ----------------------------------------------------------------------
  
def test_split_text_wc_per_macro_parsing_ec():
  # Exercises {split_text_wc_per_macro_parsing_ec} on a fixed EVA text
  # {text_wc} and a macro-parsing {segs_ec} of it from which the
  # separators ",.~-" have been removed.  Traces the result to {err}.
  #
  # Checks that the macro-parsing has an odd number of segments, that
  # the split yields one piece per segment, and that the pieces,
  # stripped of "." and "~", concatenate to {text_wc} stripped of
  # ".", ",", and "-".

  err.write("=== testing split_text_wc_per_macro_parsing_ec ===\n")

  text_wc =   'poar.keeo.daiin.qotedair.apo.rair.apy.okeey.qokaiin.or.aiir.al.dal.sheeo.daiin.chsd.qokeeey'

  segs_ec_dotted = ( 'poar.keeo', 'daiin', 'qotedair', 'apo', 'rair.apy.okeey.qokaiin.or.aiir.al.dal.sheeo',  'daiin', 'chsd.qokeeey' )
  sep_pat = re.compile(r"[,.~-]")
  segs_ec = [ sep_pat.sub("", sg) for sg in segs_ec_dotted ]

  # Segment count must be odd:
  ns = len(segs_ec)
  nh = ns // 2
  ng = nh + 1
  assert ns == ng + nh

  segs_wc = split_text_wc_per_macro_parsing_ec(text_wc, segs_ec)
  write_wh_ec_wc_macro_parsings(err, "!$", None, None, segs_wc)
  assert len(segs_wc) == ns

  # Compare the letters of the input text with those of the pieces:
  letters_in = re.sub(r"[.,-]", "",  text_wc)
  err.write(f"chicktx_wc = {letters_in!r}\n")
  letters_out = re.sub(r"[.~]", "", "".join(segs_wc))
  err.write(f"chocktx_wc = {letters_out!r}\n")
  assert letters_in == letters_out
  return
  # ----------------------------------------------------------------------

def test_gobble_up_bites_of_items():
  # Exercises {gobble_up_bites_of_items} three times on the same
  # hand-made entry, given as a table of (tag, hanzi item, English
  # item) triples.  The calls differ in the pure hanzi string to be
  # consumed, in the leftover pieces ("rest") of a previous item, and
  # in the index {it} of the next item.  Results are traced to {err}
  # and compared with the expected values by assertions.

  err.write("=== testing gobble_up_bites_of_items ===\n")

  table = (
      ( '(A)',     '丹雄鸡：　　　', 'Red rooster:',     ),
      ( '(A1)',    '　［味］　　　', '[Flavor]',         ),
      ( '(A11)',   '　　甘，　　　', 'sweet,',           ),
      ( '(A12)',   '　　微温。　　', 'a bit fuzzy.',     ),
      ( '(A3)',    '　［主治］　　', '[Main uses]',      ),
      ( '(A31)',   '　　（女子）　', '(Women)',          ),
      ( '(A311)',  '　　　赤白沃。', 'lipstick.',        ),
      ( '(A32)',   '　　补虚，　　', 'sawing wood,',     ),
      ( '(A33)',   '　　杀毒，　　', 'fixing cars,',     ),
      ( '(A34)',   '　　辟不祥。　', 'washing floors.',  ),
      ( '(B)',     '头：　　　　　', 'Head:',            ),
      ( '(B3)',    '　［主］　　　', '[Mainly for]',     ),
      ( '(B31)',   '　　杀鬼。　　', 'killing time.',    ),
      ( '(C)',     '肪：　　　　　', 'Pipes:',           ),
    )
  # Transpose the table into its three columns:
  items_tg, items_wh, items_en = ( list(col) for col in zip(*table) )
  err.write("### input items ###\n")
  write_tg_wh_wc_en_micro_parsings(err, '!=', items_tg, items_wh, None, items_en)

  # Character classes and hyphen mark handed to the procedure:
  punct_wh = set(r"，。；：　～")
  brack_wh = set(r"［］（）")
  hyphen_wh = '～'

  rule = "============================================================\n"

  def gobble(pure_wh, rest_wh, rest_tg, rest_en, it):
    # Calls {gobble_up_bites_of_items} on the item lists above and
    # returns its results unchanged.
    return gobble_up_bites_of_items \
      ( rest_wh, items_wh, rest_tg, items_tg, rest_en, items_en,
        it, punct_wh, brack_wh, pure_wh, hyphen_wh
      )

  def show(sfx, bites, rest, it):
    # Writes to {err} one trace line with the bites and rest of the
    # column whose name suffix is {sfx}, plus the item index {it}.
    err.write(f"bites_{sfx} = {bites!r} rest_{sfx} = {rest!r} it = {it!r}\n")

  # Case 1: starts with leftovers of an item and ends inside item 3.
  err.write(rule)
  bites_wh, rest_wh, bites_tg, rest_tg, bites_en, rest_en, it = \
    gobble("雄丹鸡味甘微", '雄丹鸡', '(AH)', 'Crimson rooster', 1)
  show("wh", bites_wh, rest_wh, it)
  show("tg", bites_tg, rest_tg, it)
  show("en", bites_en, rest_en, it)
  assert bites_wh == [ '～雄丹鸡',         '　［味］　　　', '　　甘，　　　',   '　　微～',        ]
  assert bites_tg == [ '(AH)',            '(A1)',         '(A11)',         '(A12)',        ]
  assert bites_en == [ 'Crimson rooster', '[Flavor]',     'sweet,',        'a bit fuzzy.', ]
  assert rest_wh == "温。　　" and it == 4

  # Case 2: no leftovers; consumes exactly one bracketed item.
  err.write(rule)
  bites_wh, rest_wh, bites_tg, rest_tg, bites_en, rest_en, it = \
    gobble("主治", '', '', '', 4)
  show("wh", bites_wh, rest_wh, it)
  show("tg", bites_tg, rest_tg, it)
  show("en", bites_en, rest_en, it)
  assert bites_wh == [ '　［主治］　　',     ]
  assert bites_tg == [ '(A3)',            ]
  assert bites_en == [ '[Main uses]', ]
  # Either of two final states is accepted here:
  assert (rest_wh == "（女子）　" and it == 6) or (rest_wh == "" and it == 5)

  # Case 3: leftovers are only ideographic spaces; runs to the last
  # item.  The end state is checked right after the first trace line.
  err.write(rule)
  bites_wh, rest_wh, bites_tg, rest_tg, bites_en, rest_en, it = \
    gobble("杀鬼肪", '　　　', '', '', 12)
  show("wh", bites_wh, rest_wh, it)
  assert rest_wh == "" and it == 14
  show("tg", bites_tg, rest_tg, it)
  show("en", bites_en, rest_en, it)

  return
  # ----------------------------------------------------------------------

# Self-test entry: the tests run only when the program is invoked
# with exactly one command line argument, equal to "ALBSI.TEST".
if sys.argv[1:] == [ "ALBSI.TEST" ]:
  test_stuff()