#! /usr/bin/python3
# Last edited on 2026-02-27 07:08:37 by stolfi

# Functions for measuring size and listing word position in a
# parag. To be included in other python scripts.

import sys, os, re
from sys import stdout as out, stderr as err, stdin as inp

from error_funcs import arg_error, file_line_error, prog_error
from process_funcs import bash, basic_line_loop
from chinese_funcs import read_chinese_char_set
from note_077_funcs import compute_and_print_stats

def get_chars_to_ignore(enc, unit):
    # Returns a set of characters that is to be ignored when
    # measuring the size of a parag or computing the positions of
    # a word or pattern in it.
    #
    # The {enc} must be "chu", "pys", or "eva"; the {unit} must be a
    # unit type valid for that encoding, as in {get_parsing_patterns}.
    ignored_chars = set()
    if enc == "chu":
        if unit != "ch": arg_error(f"invalid combo {enc = !r} {unit = !r}")
        # Read tables of chinese character sets:
        set_dir = "langbank/chin"
        ignored_chars |= read_chinese_char_set(f"{set_dir}/utf8-invalid.tbl")
        ignored_chars |= read_chinese_char_set(f"{set_dir}/utf8-bullets.tbl")
        ignored_chars |= read_chinese_char_set(f"{set_dir}/utf8-symbol.tbl")
        ignored_chars |= read_chinese_char_set(f"{set_dir}/utf8-punct.tbl")
        ignored_chars |= read_chinese_char_set(f"{set_dir}/utf8-blank.tbl")
    elif enc == "pys":
        if unit != "ps": arg_error(f"invalid combo {enc = !r} {unit = !r}")
        ignored_chars |= set("-,.;:")
    elif enc == "eva":
        if unit == "ec":
            ignored_chars |= set("-,.")
        elif unit == "wc":
            # We don't need any ignored chars set.
            pass
        elif unit == "wp":
            # We ignore commas:
            ignored_chars |= set(",")
        else:
            arg_error(f"invalid combo {enc = !r} {unit = !r}")
    else:
        arg_error(f"invalid input encoding '{enc}'")
    return ignored_chars
    # ----------------------------------------------------------------------

def get_parsing_patterns(enc, unit):
    # Returns
    #
    #   {pat_line}  RE pattern matching a data line, where group 1
    #               is the locator (minus '<>') and group 2 is the text.
    #
    #   {pat_unit}  RE pattern matching one unit in the raw text.
    #
    #   {pat_junk}  RE pattern matching any char or substring that should be
    #               deleted before parsing the raw text into units.
    #
    #   {pat_sepa}  RE pattern matching any one char that serves
    #               as unit separator in the raw text.
    #
    #   {clean_sepa}  String that should precede, follow,
    #                 and separate units in clean text.
    #
    # The patterns {pat_junk} and/or {pat_sepa} may be {None}
    # if there are supposed to be no chars in that role.
    #
    # If {enc} is "chu", the {loc} must be "b{sec}.{subsec}.{lseq}" and
    # the {text} must be a string of hanzi (Chinese chars) in Unicode
    # encoding.
    #
    # If {enc} is "eva", the {loc} must be "f[0-9]*[rv][0-9]*" and the
    # text must be a string of EVA chars [?a-z] or EVA punctuation [-,.]
    # with optional "<%>" prefix and "<$>" suffix.
    #
    # The {unit} specifies what constitutes a unit of the text. It can be:
    #
    #   "ch" (for {enc}="chu"): a Chinese character.
    #
    #   "ps" (for {enc}="pys"): a pinyin syllable.
    #
    #   "ec" (for {enc}="eva"): an EVA character in [?a-z].
    #
    #   "wc" (for {enc}="eva"): a word, considering comma like period.
    #
    #   "wp" (for {enc}="eva"): a word, ignoring commas.
    #
    if enc == "chu" or enc == "pys":
        pat_loc = r"<(b[1-3][.][0-9][.][0-9][0-9][0-9])>"
        if enc == "chu":
            if unit != "ch": arg_error(f"invalid combo {enc = !r} {unit = !r}")
            pat_unit = r"."                # Each char is a unit.
            pat_ipu = r"[:。,\[\]]"       # Ideographic punctuation.
            pat_ann = r"[(][^()]*[)]"  # Apocryphal annotations (ideographic parens).
            pat_sepa = None                # There are no unit separators on input.
            clean_sepa = ""                # No separators in cleaned text.
        elif enc == "pys":
            if unit != "ps": arg_error(f"invalid combo {enc = !r} {unit = !r}")
            pinyin_cons = r"b-df-hj-np-tv-z"  # Pinyin consonants.
            pinyin_vows = r"aeiouü" + r"āēīōūǖ" + r"àèìòùǜ" + r"áéíóúǘ" + r"ǎěǐǒǔǚ"
            pat_syl = f"[{pinyin_cons}]*[{pinyin_vows}]+(r|ng|n|)"
            pat_unit = f"{pat_syl}"        # A unit is a pinyin syllable.
            pat_ipu = r"[-,.;:'*]"         # Pinyin punctuation.
            pat_ann = r"[(][^()]*[)]"      # Apocryphal annotations (ascii parens).
            pat_sepa = r"[- .,']"          # Any of these serves as a unit separator on input.
            clean_sepa = ' '               # Single blank separator in cleaned text.
        # Punctuation and annotations are junk for both "chu" and "pys":
        pat_junk = f"{pat_ipu}|{pat_ann}"  # Junk to be deleted.
    elif enc == "eva":
        pat_loc = r"<(f[0-9]+[rv][0-9]*[.][0-9]+)>"
        if unit == "ec":
            pat_unit = r"[a-z?]"   # Unit is a single EVA char.
            pat_junk = r"[-,. ]"   # Any of these should be deleted.
            pat_sepa = None        # There are no unit separators on input.
            clean_sepa = ''        # No unit separator in cleaned text.
        elif unit == "wc":
            pat_unit = r"[a-z?]+"  # Unit is one or more EVA chars.
            pat_junk = r"[ ]"      # Only blanks can be just deleted.
            pat_sepa = r"[-,.]"    # Any of these serves as a unit separator on input.
            clean_sepa = '.'       # Unit separator is '.' in cleaned text.
        elif unit == "wp":
            pat_unit = r"[a-z?]+"  # Unit is one or more EVA chars.
            pat_junk = r"[, ]"     # Commas too can be deleted in input text.
            pat_sepa = r"[-.]"     # Periods and line break separate units on input.
            clean_sepa = '.'       # Unit separator is '.' in cleaned text.
        else:
            arg_error(f"invalid combo {enc = !r} {unit = !r}")
    else:
        # Was a bare {assert False}; report the bad encoding like
        # {get_chars_to_ignore} does:
        arg_error(f"invalid input encoding '{enc}'")
    pat_line = f"{pat_loc}[ ]+(.*)\n"
    return pat_line, pat_unit, pat_junk, pat_sepa, clean_sepa
    # ----------------------------------------------------------------------

def average_word_size(unit):
    # Nominal average number of {unit}s for a Chinese ideogram word.
    #
    # BUG FIX: the original body ended with the placeholder "return ???"
    # (a syntax error) for {unit} "wc".  The nominal factors for the EVA
    # units ("ec", "wc", "wp") were never determined, so those cases now
    # fail loudly instead of failing to compile.
    if unit == "ch" or unit == "ps":
        return 1.0
    else:
        # TODO: determine the nominal factors for the EVA unit types.
        raise NotImplementedError(f"average_word_size not defined for {unit = !r}")
    # ----------------------------------------------------------------------

def split_text_into_units(text, enc, unit, pat_unit, pat_junk, pat_sepa, data_error):
    # Splits the raw {text}, assumed to be of the type specified by {enc},
    # into units of the type specified by {unit}.
    #
    # Returns a list {units} with the units, and a count {ct_sepa} of
    # chars in {text} that were matched by {pat_sepa} (and thus not counted as units).
    #
    # Characters matched by {pat_junk} are deleted and not counted.
    #
    # Currently the {enc} must be "chu" (Chinese characters in Unicode),
    # "pys" (pinyin with separated syllables in Unicode), or "eva"
    # (EVA-encoded Voynichese).
    #
    # The {unit} must be either "ch", "ps", "ec", "wc", or "wp",
    # as described under {get_parsing_patterns}.
    #
    # If {enc} is "eva", also deletes inline comments, alignment and
    # parag markers, and paired ligature braces '{}'; turns weirdo codes
    # into '?'; maps everything to lowercase, all this before splitting
    # into units and counting the punctuation.
    #
    # After that cleanup, the parsing into units uses
    # {pat_unit} and {pat_sepa}.
    #
    # The {data_error} procedure is called with a message string if the
    # text contains chars that match neither {pat_unit} nor {pat_sepa}.

    # Remove junk not counted as punctuation:
    if enc == "chu":
        # Nothing special to remove
        pass
    elif enc == "pys":
        # Nothing special to remove
        pass
    elif enc == "eva":
        # First remove junk that should not be there, just in case:
        text = re.sub(r"[<][!][^<>]*[>]", "", text)          # Inline comments.
        text = re.sub(r"[«=» ]", "", text)                   # Alignment/parag markers, blanks.
        text = re.sub(r"[{]([^{}]*)[}]", r"\1", text)        # Paired ligature braces.
        text = re.sub(r"[&][0-9][0-9][0-9][;]?", "?", text)  # Weirdo codes become '?'.
        text = re.sub(r"^<[%]>", "", text)                   # Parag start marker.
        text = re.sub(r"<[$]>$", "", text)                   # Parag end marker.
        # Map all to lowercase:
        text = text.lower()
    else:
        arg_error(f"invalid encoding {enc = !r}")

    # Remove simple junk chars:
    if pat_junk is not None:
        text = re.sub(pat_junk, '', text)

    # Now loop on units and separators:
    ct_sepa = 0
    units = []
    while len(text) > 0:
        if pat_sepa is not None:
            m = re.match(pat_sepa, text)
            if m is not None:
                # {re.match} is anchored at 0, so {m.end()} is the match length:
                n = m.end()
                assert n > 0, f"pattern {pat_sepa} matched empty"
                ct_sepa += n
                text = text[n:]
                continue
        m = re.match(pat_unit, text)
        if m is not None:
            n = m.end()
            assert n > 0, f"pattern {pat_unit} matched empty"
            units.append(text[0:n])
            text = text[n:]
            continue
        data_error(f"invalid chars = '{text}'")
    return units, ct_sepa
    # ----------------------------------------------------------------------

def list_occurrences(word, units, clean_sepa, unit, data_error):
    # Returns a list of the occurrences of {word} in
    # the text that consists of the list of {units} of type {unit}
    # preceded, separated, and terminated by {clean_sepa}.
    #
    # Validates the assembled clean text for stray characters that the
    # cleanup should have removed, calling {data_error} if any is found,
    # then dispatches to the unit-specific search function.
    if len(units) == 0: return []
    clean_text = clean_sepa + clean_sepa.join(units) + clean_sepa
    if unit == "ch":
        m = re.search(r"[\000-\377]|[ :。,()]", clean_text)
        if m is not None:
            data_error(f"invalid character '{m.group(0)}' in hanzi text")
        oclist = list_ch_occurrences(word, clean_text)
    elif unit == "ps":
        m = re.search(r"[.,:;()*!?]", clean_text)
        if m is not None:
            data_error(f"invalid character '{m.group(0)}' in pinyin text")
        oclist = list_ps_occurrences(word, clean_text)
    elif unit == "ec":
        m = re.search(r"[^a-z?]", clean_text)
        if m is not None:
            data_error(f"invalid character '{m.group(0)}' in word-less EVA text")
        oclist = list_ec_occurrences(word, clean_text)
    elif unit == "wc" or unit == "wp":
        m = re.search(r"[^.a-z?]", clean_text)
        if m is not None:
            data_error(f"invalid character '{m.group(0)}' in word-split EVA text")
        oclist = list_wc_or_wp_occurrences(word, clean_text)
    else:
        arg_error(f"invalid {unit = !r}")
    return oclist
    # ----------------------------------------------------------------------

def list_ch_occurrences(word, clean_text):
    # Finds occurrences of {word} as string in the Unicode CHU-8 hanzi
    # {clean_text}, returning their positions. The text {clean_text} should have no
    # iso-latin-1 chars or hanzi punctuation.
    #
    # Positions are measured in hanzi chars.
    oclist = list(( p.start() for p in re.finditer(word, clean_text) ))
    return oclist
    # ----------------------------------------------------------------------

def list_ps_occurrences(word, clean_text):
    # Search {word} in {clean_text} matching only whole syllables. The
    # {clean_text} text should have single ' ' before, between, and after
    # each syllable.
    #
    # Positions are measured in pinyin syllables.
    debug = False
    assert clean_text[0] == ' ' and clean_text[-1] == ' ', "cleanup failed"
    # We must loop on syllables:
    rest = clean_text
    word = r"\b" + word + r"\b" # Ensure that {word} matches only whole sylls.
    oclist = []
    pref = ""
    while True:
        if debug: err.write(f"!* '{pref}|{rest}'\n")
        m = re.search(word, rest)
        if m is None: break
        ix = m.start(); fx = m.end()
        if debug: err.write(f"!* {ix = } {fx = }\n")
        assert ix < len(rest)
        # If {word} matched just before ' ', adjust {ix} to exclude the ' ':
        if rest[ix] == ' ': ix = ix + 1
        # If {word} matched just after ' ', adjust {fx} to exclude the ' ':
        if fx < len(rest) and rest[fx-1] == ' ': fx = fx - 1
        if debug: err.write(f"!* adjusted {ix = } {fx = }\n")
        # Make sure that match is whole words:
        assert ix > 0 and ix < len(rest) and rest[ix-1] == ' '
        assert fx > ix, f"the pattern {word = !r} matched the empty string"
        assert fx < len(rest) and rest[fx] == ' '
        # Count whole words before occurrence of {word}:
        pref = pref + rest[0:ix]; core = rest[ix:fx]; rest = rest[fx:]
        if debug: err.write(f"!* '{pref}[{core}]{rest}'\n")
        oc = len(re.findall(r"[ ]", pref)) - 1
        if debug: err.write(f"!* {oc = }'\n")
        oclist.append(oc)
        pref = pref + core
    return oclist
    # ----------------------------------------------------------------------

def list_ec_occurrences(word, clean_text):
    # Finds occurrences of {word} as string in the EVA {clean_text}, returning
    # their positions. Assumes that all blanks and EVA punctuation [-,.]
    # have been deleted.
    #
    # Positions are measured in EVA character counts.
    oclist = list(( p.start() for p in re.finditer(word, clean_text) ))
    return oclist
    # ----------------------------------------------------------------------

def list_wc_or_wp_occurrences(word, clean_text):
    # Finds occurrences of {word} in the EVA {clean_text}, returning their
    # positions.
    #
    # Assumes that blanks, other EVA junk, and unwanted separators ([,]
    # for "wp") have been deleted, relevant unit separators ([-,.] for
    # "wc", [-.] for "wp") have been mapped to '.', and there is a single
    # '.' before, between, and after every unit.
    #
    # Positions are measured in EVA words. Unmatched EVA word suffixes or
    # prefixes are counted as 0.5 word.
    assert clean_text[0] == '.' and clean_text[-1] == '.', "cleanup failed"
    debug = False
    # We must loop on words:
    # BUG FIX: the original re-wrapped {clean_text} in '.'s here, even
    # though the assert above guarantees it is already wrapped; the extra
    # dots shifted every reported position by +1.
    rest = clean_text
    oclist = []
    pref = ""
    while True:
        if debug: err.write(f"!* '{pref}|{rest}'\n")
        m = re.search(word, rest)
        if m is None: break
        ix = m.start(); fx = m.end()
        if debug: err.write(f"!* {ix = } {fx = }\n")
        assert ix < len(rest)
        # If {word} matched just before '.', adjust {ix} to exclude the '.':
        if rest[ix] == '.': ix = ix + 1
        # If {word} matched just after '.', adjust {fx} to exclude the '.':
        # BUG FIX: the original tested for ' ' (copy-paste from the pinyin
        # version), which never occurs in '.'-separated EVA text.
        if fx < len(rest) and rest[fx-1] == '.': fx = fx - 1
        if debug: err.write(f"!* adjusted {ix = } {fx = }\n")
        # Count whole words before occurrence of {word}:
        pref = pref + rest[0:ix]; core = rest[ix:fx]; rest = rest[fx:]
        if debug: err.write(f"!* '{pref}[{core}]{rest}'\n")
        oc = len(re.findall(r"[.]", pref)) - 1
        if debug: err.write(f"!* {oc = }'\n")
        # If {word} matched only a suffix, count the prefix as half a word:
        if pref != '' and pref[-1] != '.': oc += 0.5
        oclist.append(oc)
        pref = pref + core
    return oclist
    # ----------------------------------------------------------------------

def test_stuff():
    # Smoke-tests the parsing and occurrence-listing functions on small
    # sample texts in each supported encoding, writing results to {err}.

    def data_error(msg):
        err.write(f"** {msg}\n"); assert False
        # ....................................................................

    for enc, unit in ( ("eva","wc",), ("eva","wp",), ("eva","ec",), ("chu","ch",), ("pys", "ps",) ):
        err.write(f"!* counting '{unit}' units in {enc} text\n")
        ignored_chars = get_chars_to_ignore(enc, unit)
        err.write(f"!* {ignored_chars = !r}\n")
        pat_line, pat_unit, pat_junk, pat_sepa, clean_sepa = \
            get_parsing_patterns(enc, unit)
        if enc == "eva":
            text = "foo.bar.,baz.-qofoo,barrifoo"
            words = [ "foo", "bar", ]
            if unit == "wc" or unit == "wp":
                words += [ "[.]foo", "foo[.]", "[.]foo[.]", "foo[.]bar", ]
                clean_sepa = '.'
            elif unit == "ec":
                clean_sepa = ''
        elif enc == "pys":
            text = " tóng yè wèi. zhǔ zhì è zhe zhǔ zhì yīn. pí, zhǔ wǔ zhì, shāng ǔ. ǔ."
            words = [ "zhǔ", "zhǔ zhì" ]
            assert unit == "ps"
            clean_sepa = ' '
        elif enc == "chu":
            text = "白石英:味甘微温。主治消渴,阴痿不足,欬逆白石,青石黑石脂等:味甘平。主治黄疸,泄利"
            words = [ "白石", "主治" ]
            assert unit == "ch"
            clean_sepa = ''
        err.write(f"!* raw text = '{text}')\n")
        units, ct_sepa = \
            split_text_into_units(text, enc, unit, pat_unit, pat_junk, pat_sepa, data_error)
        err.write(f"!* {ct_sepa = } {units = !r}\n")
        err.write("\n")
        for word in words:
            err.write(f"!* looking up '{word}'\n")
            oclist = list_occurrences(word, units, clean_sepa, unit, data_error)
            err.write(f"!* found = {oclist}\n")
        err.write("\n")
    return
    # ----------------------------------------------------------------------

# BUG FIX: the original tested {len(sys.argv) > 0}, which is always true,
# so running the module with no arguments crashed on {sys.argv[1]}.
if len(sys.argv) > 1 and sys.argv[1] == "test":
    test_stuff()