#! /usr/bin/python3
# Last edited on 2026-03-29 20:47:27 by stolfi

# Functions for measuring size and listing word position in a
# parag. To be included in other python scripts.

import sys, os, re
from sys import stdout as out, stderr as err, stdin as inp
from error_funcs import arg_error, file_line_error, prog_error
from process_funcs import bash, basic_line_loop
from chinese_funcs import read_chinese_char_set
from note_077_funcs import compute_and_print_stats
from math import sqrt, hypot, exp, log, pi, inf, nan, floor, ceil

def clean_up_raw_text(text, utype, data_error):
    # Cleans up the raw {text} from an SBJ or SPS transcription file according to {utype}
    # for use in Note 077 analysis.
    #
    # See {clean_up_bencao_raw_text} for the meaning of {utype}s "ch" and "ps".
    # See {clean_up_starps_raw_text} for the meaning of {utype}s "ec", "wp", and "wc".
    #
    if utype == "ch" or utype == "ps":
        text = clean_up_bencao_raw_text(text, utype, data_error)
    elif utype == "ec" or utype == "wc" or utype == "wp":
        # The parag head/tail flags are not needed by callers of this wrapper:
        text, _head, _tail = clean_up_starps_raw_text(text, utype, data_error)
    return text
    # ----------------------------------------------------------------------

def normalize_raw_text(text, utype, data_error):
    # Modifies a raw {text} from an SBJ or SPS transcription
    # file according to {utype} for use in Note 077 analysis, by replacing
    # certain substrings that are assumed to be scribal errors,
    # abbreviations, or non-significant variants.
    #
    # See {normalize_bencao_raw_text} for the meaning of {utype}s "ch" and "ps".
    # See {normalize_starps_raw_text} for the meaning of {utype}s "ec", "wp", and "wc".
    #
    if utype == "ch" or utype == "ps":
        text = normalize_bencao_raw_text(text, utype, data_error)
    elif utype == "ec" or utype == "wc" or utype == "wp":
        text = normalize_starps_raw_text(text, utype, data_error)
    return text
    # ----------------------------------------------------------------------

def clean_up_starps_raw_text(text, utype, data_error):
    # Cleans up the raw {text} from an SPS transcription file according to {utype}
    # for use in Note 077 analysis.
    #
    # The function returns the cleaned {text} and booleans {head} and
    # {tail} that specify whether the line was a parag head line or parag
    # tail line, respectively. Note that both may be true for a one-line
    # parag.
    #
    # INPUT TEXT EXPECTED
    #
    # The raw {text} given must be a non-empty string that may contain
    # any of these:
    #
    #   Prefix "<%>" marking the line as the head of a paragraph.
    #
    #   Prefix [«=»] to indicate the alignment of the start of the line
    #   relative to the left rail.
    #
    #   EVA letters [A_Za-z].
    #
    #   Invalid EVA code '?'.
    #
    #   Weirdo codes "&{NNN};" (possibly without the ';')
    #   where {NNN} is three decimal digits.
    #
    #   Ligatures consisting of two or more EVA letters, '?', or
    #   weirdo codes enclosed in braces "{...}".
    #
    #   Word separators [-.,].
    #
    #   Suffix [«=»] to indicate the alignment of the end of the line
    #   relative to the right rail.
    #
    #   Suffix "<$>" to mark the line as the tail of a parag.
    #
    #   Inline comments "<!...>", including various special comments
    #   to indicate stars in the margin, wide linegaps, figure intrusions,
    #   vellum folds, etc. These may appear anywhere in the line
    #   including before or after all the {text} and inside other markup
    #   tags.
    #
    # Example:
    #
    #   "={Ch}eeo,daiin.{Sh}eedy&162&211am.{Ch}odal.ol{Ch}edy«<$>"
    #   "<%>=w{Ch}edairs.oeail.{Ch}otaropdaiin.otol,dair.aiir,aim="
    #   "={Ch}ol.oeedy.keedy.{Ch}{Ch}eky{air.ar.{Sh}ol.{Ch}edy.ot{Ch}edy.{Qo}ty="
    #
    # Note the parag break between the first and second text.
    #
    # CLEANUP PERFORMED
    #
    # For all {utype}s, the cleanup entails:
    #
    #   Removing all inline comments.
    #
    #   Removing all markup, including the braces [{}], parag markers "<%>" and "<$>",
    #   and rail alignment markers [«=»].
    #
    #   Mapping 'w' and 'z', used in my transcription to encode hooked puffs,
    #   to 'p' and 'f', respectively.
    #
    #   Mapping the uncertain reading characters [bj] to '?'.
    #
    #   Mapping all weirdo codes to '?'.
    #
    #   Mapping everything to lowercase.
    #
    # Moreover,
    #
    #   If {utype} is "ec", all punctuation [-,.] is deleted.
    #
    #   If {utype} is "wp", every ',' is deleted, and '-' is mapped to '.'
    #
    #   If {utype} is "wc", every ',' and '-' are mapped to '.'
    #
    # CLEANED TEXT
    #
    # After this cleanup, if {utype} is "ec", the text must have only
    # characters [ac-fhik-tvxy?]. Example:
    #
    #   "choloeedykeedychchekycheod?eyke??edeedydaiiinaldair"
    #
    # If {utype} is "wc" or "wp", the raw text may also have '.' separators. Example:
    #
    #   "chol.oeedy.keedy.chcheky.cheod?ey.ke??edeedy.daiiin.ald.air"
    #
    # For more restrictions, see {check_starps_cleanup} below.
    #
    text = text.strip()
    if utype == "ec" or utype == "wc" or utype == "wp":
        # Remove inline comments:
        text = re.sub(r"[<][!][^<>]*[>]", "", text)
        # Map weirdo codes to '?':
        text = re.sub(r"[&][0-9][0-9][0-9][;]?", "?", text)
        # Remove ligature marks '{}':
        text = re.sub(r"[{]([^{}]*)[}]", r"\1", text)
        # Map to lowercase:
        text = text.lower()
        # Map hooked puffs to normal ones:
        text = re.sub(r"w", "p", text)
        text = re.sub(r"z", "f", text)
        # Map rare uncertain characters to '?':
        text = re.sub(r"[bj]", "?", text)
        # Check for parag markers:
        head = ( re.search(r"^[<][%][>]", text) is not None )
        tail = ( re.search(r"[<][$][>]$", text) is not None )
        # Remove all parag markers for now:
        text = re.sub(r"[<][%$][>]", "", text)
        # Remove alignment markers:
        text = re.sub(r"^[«=»]", "", text)
        text = re.sub(r"[«=»]$", "", text)
        # Check for invalid characters in input:
        m_bad = re.search(r"[^-.,ac-ik-z?]", text)
        if m_bad is not None:
            ibeg = m_bad.start()
            data_error(f"invalid character '{text[ibeg]}' at {ibeg}")
        # Check for irregular spaces in input (leading, trailing, or doubled punctuation):
        m_bad = re.search(r"^[-,.]|[-,.][-,.]|[-,.]$", text)
        if m_bad is not None:
            ibeg = m_bad.start(); iend = m_bad.end()
            data_error(f"improper punctuation '{text[ibeg:iend]}' at {ibeg}")
        # Replace punctuation per {utype}:
        text = re.sub(r"-", ".", text)
        if utype == "ec":
            text = re.sub(r"[,.]", "", text)
        elif utype == "wc":
            text = re.sub(r"[,]", ".", text)
        elif utype == "wp":
            text = re.sub(r"[,]", "", text)
        else:
            assert False # Should not happen.
    else:
        arg_error(f"invalid {utype = !r}")
    check_starps_cleanup(text, utype, data_error)
    return text, head, tail
    # ----------------------------------------------------------------------

def clean_up_bencao_raw_text(text, utype, data_error):
    # Cleans up the raw {text} from an SBJ transcription file according to {utype}
    # for use in Note 077 analysis. Returns the cleaned {text}.
    #
    # INPUT TEXT EXPECTED
    #
    # For any {utype} the input text may contain editorial annotations and markup.
    # The details depend on {utype}:
    #
    #   "ch": the input raw {text} must consist of simplified hanzi
    #   (Chinese characters) in Unicode, with ideographic punctuation
    #   and markup characters [][;:,。]. It is assumed that
    #   editorial annotations delimited by ideographic parens(...)were
    #   moved to #-comments, but some tags like (女子)remain, and they
    #   are not removed. The raw {text} should not have embedded ASCII
    #   blanks or other ASCII chars. Example:
    #
    #     "冬葵子:[味]甘寒。[主治]五藏六腑寒热羸瘦。[久服]坚骨长肌肉。(草头)"
    #
    #   "ps": the raw {text} must be a string of isolated pinyin with
    #   syllables separated by blanks or ASCII punctuation. There may be
    #   tags like (nǚ zǐ) in ASCII parentheses; other editorial comments
    #   should have been removed. The {text} may contain apostrophes,
    #   asterisks, and other ASCII markup. Example:
    #
    #     "mǔ lì: [wèi] xián píng. [zhǔ zhì] shāng hán hán*(cǎo tóu) rè. [shēng] chí zé."
    #
    # CLEANUP
    #
    # Cleanup for both {utype}s implies:
    #
    #   Replacing all punctuation by spaces.
    #
    # If {utype} is "ps", the cleanup then entails removing
    # extra spaces, leaving only one space between syllables.
    #
    # CLEANED TEXT
    #
    # After this cleanup, if {utype} is "ch", the raw text must
    # consist entirely of simplified hanzi. Example:
    #
    #   "冬葵子味甘寒主治五藏六腑寒热羸瘦久服坚骨长肌肉"
    #
    # If {utype} is "ps", the text must consist entirely of pinyin
    # syllables, separated by single spaces. Example:
    #
    #   "mǔ lì wèi xián píng zhǔ zhì shāng hán hán rè shēng chí zé"
    #
    if utype == "ch":
        # # Remove annotations (enclosed in ideographic parens):
        # text = re.sub(r"[(][^()]*[)]", "", text)
        # Remove all punctuation:
        text = re.sub(r"[][;:,。()]", "", text)
    elif utype == "ps":
        # Remove asterisks:
        text = re.sub(r"[*]", "", text)
        # # Remove annotations enclosed in ASCII parens:
        # text = re.sub(r"[(][^()]*[)]", "", text)
        # Remove all punctuation, leave normalized blanks:
        text = re.sub(r"[][',.:;()]", " ", text)
        text = re.sub(r"[ ][ ]+", " ", text)
        text = text.strip()
    else:
        arg_error(f"invalid text unit type {utype = !r}")
    check_bencao_cleanup(text, utype, data_error)
    return text
    # ----------------------------------------------------------------------

def normalize_starps_raw_text(text, utype, data_error):
    # Normalizes the raw {text} from an SPS transcription file according to {utype}
    # for use in Note 077 analysis. Returns the normalized text.
    #
    # For all {utype}s, the normalization entails fixing assumed scribal
    # mistakes and assumed non-significant handwriting variation,
    # by mapping
    #
    #   'g','m'  to 'il'
    #   'u'      to 'n'
    #   'ir'     to 'iin'
    #   'hh'     to 'he'
    #   'ih'     to 'ch'
    #   'iGh'    to 'cGh' for any gallows G.
    #
    if utype == "ec" or utype == "wc" or utype == "wp":
        # Reduce presumed handwriting variants ('g' joins 'm' before 'm' -> 'il' below):
        text = re.sub(r"g", "m", text)
        text = re.sub(r"u", "n", text)
        # Correct presumed scribal and transcription errors and abbreviations:
        text = re.sub(r"ir", "iin", text)
        text = re.sub(r"m", "il", text)
        text = re.sub(r"hh", "he", text)
        text = re.sub(r"ih", "ch", text)
        text = re.sub(r"i([ktpf])h", r"c\1h", text)
    else:
        arg_error(f"invalid {utype = !r}")
    return text
    # ----------------------------------------------------------------------

def normalize_bencao_raw_text(text, utype, data_error):
    # Normalizes the raw {text} from an SBJ transcription file according to {utype}
    # for use in Note 077 analysis. Returns the normalized {text}.
    #
    # The input text {text} must have the punctuation, including brackets
    # '[]' around keywords 味, 主治, 久服, etc.
    #
    # Cleanup for both {utype}s implies:
    #
    #   Deleting entry fields that apparently are not transcribed in the SPS:
    #
    #     [味]...    [wèi]          "taste and warmth"
    #     [一名]...  [yī míng]      "another name"
    #     [生 ]...   [shēng]        "place of origin"
    #
    #   Deleting the sub-entry "鸡白蠹:肥猪。" = "jī bái dù: féi zhū."
    #   which is a vet/farming use.
    #
    #   Normalizing some keys:
    #
    #     [治]     [zhì]           to [主治] [zhǔ zhì]
    #     [主]     [zhǔ]           to [主治] [zhǔ zhì]
    #     [久服]之 [jiǔ fú] zhī    to [久服] [jiǔ fú]
    #     [久食]   [jiǔ shí]       to [久服] [jiǔ fú]
    #
    if utype == "ch":
        check_bencao_cleanup(text, utype, data_error)
        # Remove fields apparently omitted from the SPS:
        text = re.sub(r"\[(味|一名|生)\][^][]*", "", text)
        text = re.sub(r"鸡白蠹[:,]?肥猪[。]?", "", text) # Vet/farming use.
        # Regularize field keywords:
        text = re.sub(r"\[治\]", "[主治]", text) # On b1.1.002.
        text = re.sub(r"\[主\]", "[主治]", text)
        text = re.sub(r"\[久服之\]", "[久服]", text) # On .
        text = re.sub(r"\[久食\]", "[久服]", text)
    elif utype == "ps":
        # Remove entries apparently omitted from the SPS:
        text = re.sub(r"\[(wèi|yī[ ]+míng|shēng)\][^][]*", "", text)
        text = re.sub(r" *\bjī[ ]+bái[ ]+dù[,:]?[ ]*féi[ ]+zhū\b[.]? *", " ", text) # Vet/farming use.
        # Regularize field keywords.
        # BUG FIX: the original patterns had a doubled closing bracket
        # ("\[zhì]\]", "\[zhǔ]\]") which matched only the literal text
        # "[zhì]]"/"[zhǔ]]" and thus never fired; the "ch" branch above
        # shows the intended single-bracket form.
        text = re.sub(r"\[zhì\]", "[zhǔ zhì]", text) # On b1.1.002.
        text = re.sub(r"\[zhǔ\]", "[zhǔ zhì]", text) # On b1.1.002.
        text = re.sub(r"\[jiǔ[ ]+fú[ ]+zhī\]", "[jiǔ fú]", text) # On .
        text = re.sub(r"\[jiǔ[ ]+shí\]", "[jiǔ fú]", text)
    else:
        arg_error(f"invalid text unit type {utype = !r}")
    return text
    # ----------------------------------------------------------------------

def check_starps_cleanup(text, utype, data_error):
    # Checks whether {text} is a valid "clean" SPS text according
    # to the given {utype}. Calls {data_error} if not.
    # Check for invalid characters:
    if utype == "ec":
        m_bad = re.search(r"[^ac-ik-vxy?]", text)
    elif utype == "wc" or utype == "wp":
        m_bad = re.search(r"[^.ac-ik-vxy?]", text)
    else:
        arg_error(f"invalid {utype = !r}")
    if m_bad is not None:
        ibeg = m_bad.start()
        data_error(f"invalid text char '{text[ibeg]}' at pos {ibeg}")
    if utype == "wc" or utype == "wp":
        # Check for improper spacing (leading, trailing, or doubled '.'):
        pat_bad_punc = r"^[.]|[.][.]|[.]$"
        m_bad = re.search(pat_bad_punc, text)
        if m_bad is not None:
            ibeg = m_bad.start()
            data_error(f"improper punctuation '{text[ibeg]}' at pos {ibeg}")
    return
    # ----------------------------------------------------------------------

def check_bencao_cleanup(text, utype, data_error):
    # Checks whether {text} is a valid "clean" SBJ text according
    # to the given {utype}. Calls {data_error} if not.
    #
    # !!! Currently does not do a thorough check. Improve! !!!
    if utype == "ch":
        # Check for ASCII/Latin-1 characters (hanzi lie above U+00FF):
        m_bad = re.search(r"[\001-\377]", text)
    elif utype == "ps":
        # Check for characters not valid in pinyin:
        pn_cons = r"b-df-hj-np-tv-z" # Pinyin consonants.
        pn_vows = r"aeiouü" + r"āēīōūǖ" + r"àèìòùǜ" + r"áéíóúǘ" + r"ǎěǐǒǔǚ"
        pn_letr = pn_cons + pn_vows
        m_bad = re.search(f"[^ {pn_letr}]", text)
    else:
        # Original fell through here leaving {m_bad} unset (NameError);
        # report a proper argument error instead, as {check_starps_cleanup} does:
        arg_error(f"invalid {utype = !r}")
    if m_bad is not None:
        ibeg = m_bad.start()
        data_error(f"invalid text char '{text[ibeg]}' at {ibeg}")
    return
    # ----------------------------------------------------------------------

def get_parsing_patterns(utype):
    # Returns
    #
    #   {pat_line}  RE pattern matching a data line, where group 1
    #               is the locator (minus '<>') and group 2 is the text.
    #
    #   {pat_unit}  RE pattern matching one unit in the raw text.
    #
    #   {pat_sepa}  RE pattern matching any char that serves
    #               as unit separator in the clean text.
    #
    #   {clean_sepa} String that should separate units in clean text.
    #
    # The pattern {pat_sepa} may be {None}
    # if there are supposed to be no chars in that role.
    #
    # See {clean_up_starps_raw_text} and {clean_up_raw_bencao_text} for
    # the semantics of {utype}.
    #
    # The locus ID format {loc} depends on {utype}:
    #
    #   "ch" or "ps": the {loc} must be "b{sec}.{subsec}.{lseq}".
    #
    #   "ec", "wp", or "wc": the {loc} must be "f[0-9]*[rv][0-9]*".
    #
    # Unit of text size/position:
    #
    #   "ch": The unit of counting is one hanzi.
    #
    #   "ps": The unit of counting is the syllable.
    #
    #   "ec": a single EVA character [?a-z].
    #
    #   "wc" and "wp": one EVA token, a string [?a-z]+ delimited by '.'
    #
    if utype == "ch" or utype == "ps":
        pat_loc = r"<(b[1-3][.][0-9][.][0-9][0-9][0-9])>"
        if utype == "ch":
            pat_unit = r"."     # Each char is a unit.
            pat_sepa = None     # There are no unit separators on input.
            clean_sepa = ""     # No separators in cleaned text.
        elif utype == "ps":
            pinyin_cons = r"b-df-hj-np-tv-z" # Pinyin consonants.
            pinyin_vows = r"aeiouü" + r"āēīōūǖ" + r"àèìòùǜ" + r"áéíóúǘ" + r"ǎěǐǒǔǚ"
            pat_syl = f"[{pinyin_cons}]*[{pinyin_vows}]+(r|ng|n|)"
            pat_unit = f"{pat_syl}" # A unit is a pinyin syllable.
            pat_sepa = r"[ ]"   # Single blank is separator in clean text.
            clean_sepa = ' '    # Single blank is separator in cleaned text.
    elif utype == "ec" or utype == "wc" or utype == "wp":
        pat_loc = r"<(f[0-9]+[rv][0-9]*[.][0-9]+)>"
        if utype == "ec":
            pat_unit = r"[a-z?]"  # Unit is a single EVA char.
            pat_sepa = None       # There are no unit separators on input.
            clean_sepa = ''       # No unit separator in cleaned text.
        elif utype == "wc" or utype == "wp":
            pat_unit = r"[a-z?]+" # Unit is one or more EVA chars.
            pat_sepa = r"[.]"     # In clean text only '.' is separator.
            clean_sepa = '.'      # Unit separator is '.' in cleaned text.
    else:
        arg_error(f"invalid {utype = !r}")
    pat_line = f"{pat_loc}[ ]+(.*)"
    return pat_line, pat_unit, pat_sepa, clean_sepa
    # ----------------------------------------------------------------------

def hanzi_per_unit(utype):
    # Nominal average number of units of type {utype} for a Chinese ideogram word.
    if utype == "ch" or utype == "ps":
        return 1.0000
    elif utype == "wc":
        return 0.7952
    elif utype == "wp":
        return 0.8994
    elif utype == "ec":
        return 0.1568
    else:
        assert False, f"invalid {utype = !r}"
    # ----------------------------------------------------------------------

def split_text_into_units(text, utype, pat_unit, pat_sepa, data_error):
    # Splits cleaned {text} into units of the type specified by {utype}.
    # Assumes that the text was cleaned according to {utype}.
    #
    # Returns a list {units} with the units, and a count {ct_sepa} of
    # chars in {text} that were matched by {pat_sepa} (and thus not counted as units).
    #
    # Currently the {utype} must be "ch" (Chinese characters in Unicode),
    # "ps" (pinyin with separated syllables in Unicode), "ec", "wc", or "wp"
    # (EVA-encoded Voynichese).
    #
    # Loop on units and separators:
    ct_sepa = 0
    units = []
    while len(text) > 0:
        if pat_sepa is not None:
            m = re.match(pat_sepa, text)
            if m is not None:
                assert m.start() == 0
                n = m.end()
                assert n > 0, f"pattern {pat_sepa} matched empty"
                ct_sepa += n
                text = text[n:]
                continue
        m = re.match(pat_unit, text)
        if m is not None:
            assert m.start() == 0
            n = m.end()
            assert n > 0, f"pattern {pat_unit} matched empty"
            units.append(text[0:n])
            text = text[n:]
            continue
        data_error(f"invalid chars = '{text}'")
    return units, ct_sepa
    # ----------------------------------------------------------------------

def list_occurrences(word, units, clean_sepa, utype, data_error):
    # Returns a list of the occurrences of {word} in
    # the text that consists of the list {units} of units of type {utype}
    # preceded, separated, and terminated by {clean_sepa}.
    #
    # Note that the bracketing {clean_sepa} make the text not valid.
    # They are added to simplify matching whole words.
    #
    # The {word} may be an RE pattern and/or may include {clean_sepa}
    # to control matches.
    #
    # !!! Misses overlapping occurrences. Is that OK? !!!
    if len(units) == 0:
        return []
    btext = clean_sepa + clean_sepa.join(units) + clean_sepa
    if utype == "ch":
        oclist = list_ch_occurrences(word, btext)
    elif utype == "ps":
        oclist = list_ps_occurrences(word, btext)
    elif utype == "ec":
        oclist = list_ec_occurrences(word, btext)
    elif utype == "wc" or utype == "wp":
        oclist = list_wc_or_wp_occurrences(word, btext)
    else:
        arg_error(f"invalid {utype = !r}")
    return oclist
    # ----------------------------------------------------------------------

def list_ch_occurrences(word, btext):
    # Finds occurrences of {word} as string in the Unicode CHU-8 hanzi
    # {btext}, returning their positions. The text {btext} should have no
    # iso-latin-1 chars or hanzi punctuation.
    #
    # Positions are measured in hanzi chars.
    oclist = list(( p.start() for p in re.finditer(word, btext) ))
    return oclist
    # ----------------------------------------------------------------------

def list_ps_occurrences(word, btext):
    # Search {word} in {btext} matching only whole syllables. The
    # {btext} text should have single ' ' before, between, and after
    # each syllable.
    #
    # Positions are measured in pinyin syllables.
    debug = False
    assert btext[0] == ' ' and btext[-1] == ' ', "bug padding"
    # We must loop on syllables:
    rest = btext
    word = r"\b" + word + r"\b" # Ensure that {word} matches only whole sylls.
    oclist = []
    pref = ""
    while True:
        if debug: err.write(f"!* '{pref}|{rest}'\n")
        m = re.search(word, rest)
        if m is None: break
        ix = m.start(); fx = m.end()
        if debug: err.write(f"!* {ix = } {fx = }\n")
        assert ix < len(rest)
        # If {word} matched just before ' ', adjust {ix} to exclude the ' ':
        if rest[ix] == ' ': ix = ix + 1
        # If {word} matched just after ' ', adjust {fx} to exclude the ' ':
        if fx < len(rest) and rest[fx-1] == ' ': fx = fx - 1
        if debug: err.write(f"!* adjusted {ix = } {fx = }\n")
        # Make sure that match is whole words:
        assert ix > 0 and ix < len(rest) and rest[ix-1] == ' '
        assert fx > ix, f"the pattern {word = !r} matched the empty string"
        assert fx < len(rest) and rest[fx] == ' '
        # Count whole words before occurrence of {word}:
        pref = pref + rest[0:ix]; core = rest[ix:fx]; rest = rest[fx:]
        if debug: err.write(f"!* '{pref}[{core}]{rest}'\n")
        oc = len(re.findall(r"[ ]", pref)) - 1
        if debug: err.write(f"!* {oc = }'\n")
        oclist.append(oc)
        pref = pref + core
    return oclist
    # ----------------------------------------------------------------------

def list_ec_occurrences(word, btext):
    # Finds occurrences of {word} as string in the EVA {btext}, returning
    # their positions. Assumes that all blanks and EVA punctuation [-,.]
    # have been deleted.
    #
    # Positions are measured in EVA character counts.
    oclist = list(( p.start() for p in re.finditer(word, btext) ))
    return oclist
    # ----------------------------------------------------------------------

def list_wc_or_wp_occurrences(word, btext):
    # Finds occurrences of {word} in the EVA {btext}, returning their
    # positions.
    #
    # Assumes that blanks, other EVA junk, and unwanted separators ([,]
    # for "wp") have been deleted, relevant unit separators ([-,.] for
    # "wc", [-.] for "wp") have been mapped to '.', and there is a single
    # '.' before, between, and after every unit.
    #
    # Positions are measured in EVA words. Unmatched EVA word suffixes or
    # prefixes are counted as 0.5 word.
    assert btext[0] == '.' and btext[-1] == '.', "cleanup failed"
    debug = False
    # We must loop on words:
    rest = btext
    oclist = []
    pref = ""
    while True:
        if debug: err.write(f"!* '{pref}|{rest}'\n")
        m = re.search(word, rest)
        if m is None: break
        ix = m.start(); fx = m.end()
        if debug: err.write(f"!* {ix = } {fx = }\n")
        assert ix < len(rest)
        # If {word} matched just before '.', adjust {ix} to exclude the '.':
        if rest[ix] == '.': ix = ix + 1
        # If {word} matched just after '.', adjust {fx} to exclude the '.'
        # (BUG FIX: the original tested for ' ' here, copied from the
        # pinyin version; the separator in this text type is '.'):
        if fx < len(rest) and rest[fx-1] == '.': fx = fx - 1
        if debug: err.write(f"!* adjusted {ix = } {fx = }\n")
        # Count whole words before occurrence of {word}:
        pref = pref + rest[0:ix]; core = rest[ix:fx]; rest = rest[fx:]
        if debug: err.write(f"!* '{pref}[{core}]{rest}'\n")
        oc = len(re.findall(r"[.]", pref)) - 1
        if debug: err.write(f"!* {oc = }'\n")
        # If {word} matched only a suffix, count the prefix as half a word:
        if pref != '' and pref[-1] != '.': oc += 0.5
        oclist.append(oc)
        pref = pref + core
    return oclist
    # ----------------------------------------------------------------------

def find_keywords(text, kword):
    # Splits {text} at every occurrence of the pattern {kword}. Returns
    # {gaps,gsizes,hits,hsizes} where {hits} is a list of the {nh}
    # occurrences of {kword} in {text}, {gaps} is a list of the {ng=nh+1}
    # strings before, between, and after those strings, and {gsizes,hsizes}
    # are lists of the lengths of those strings.
    chops = re.split(f"({kword})", text)
    assert len(chops) >= 1
    ng = (len(chops) + 1) // 2 # Number of actual gaps.
    nh = ng - 1                # Number of actual hits.
    assert len(chops) == 2*ng - 1
    gaps = [ chops[2*i] for i in range(nh+1) ]
    hits = [ chops[2*i + 1] for i in range(nh) ]
    gsizes = [ len(g) for g in gaps ]
    hsizes = [ len(h) for h in hits ]
    return gaps, gsizes, hits, hsizes
    # ----------------------------------------------------------------------

def compute_total_size_range(gsizes, hsize):
    # Computes a total size range
    # given the estimated integer ranges {gsizes[0..ng-1]} of gap sizes
    # between certain keywords and the estimated size range
    # {hsize} for the separating keyword.
    ng = len(gsizes); nh = ng - 1
    tsize_lo = nh*hsize[0]; tsize_hi = nh*hsize[1]
    for egs in gsizes:
        tsize_lo += egs[0]
        tsize_hi += egs[1]
    return (tsize_lo, tsize_hi,)
    # ----------------------------------------------------------------------

def round_est_size(sz, eps_log):
    # Rounds an estimated size after adding
    # {eps_log} in log scale. Rounds down if {eps_log}
    # is negative, up otherwise.
    if eps_log < 0:
        rsz = int(floor(exp(log(sz) + eps_log)))
        assert rsz <= sz
    elif eps_log > 0:
        rsz = int(ceil(exp(log(sz) + eps_log)))
        assert rsz >= sz
    else:
        rsz = int(sz + 0.5)
    return rsz
    # ----------------------------------------------------------------------

def parse_size_ranges(szranges_str):
    # The list {szranges_str} must be a string consisting of {neg} items
    # separated by commas, where each item is either an integer or a pair of
    # integers separated by '..'. They are converted to a list
    # {szranges[0..neg-1]} of integer pairs {(lo,hi)}.
    #
    xitems = re.split(r"[,]", szranges_str)
    szranges = []
    for xit in xitems:
        xit = xit.strip()
        if xit != "":
            m = re.fullmatch(r"([0-9]+)[.][.]([0-9]+)", xit)
            if m is not None:
                it_min = int(m.group(1))
                it_max = int(m.group(2))
                assert 1 <= it_min and it_min <= it_max, f"bad expected range {it_min}..{it_max}"
                szranges.append((it_min,it_max,))
            else:
                it_val = int(xit)
                szranges.append((it_val,it_val,))
    return szranges
    # ----------------------------------------------------------------------

def format_range(szrange):
    # Formats an integer pair {(lo,hi)} as "lo..hi", or just "lo" if equal.
    los = szrange[0]; his = szrange[1]
    if los == his:
        return str(los)
    else:
        return str(los) + ".." + str(his)
    # ----------------------------------------------------------------------

def format_size_ranges(szranges):
    # Formats a list of integer pairs as comma-separated "lo..hi" items.
    szranges_str = ''
    for szrange in szranges:
        if szranges_str != '': szranges_str += ','
        egs_str = format_range(szrange)
        szranges_str += egs_str
    return szranges_str
    # ----------------------------------------------------------------------

def combine_gaps_and_hits(gaps, gsizes, new_gsizes, hits, hsizes, new_hsizes):
    # Concatenates {gaps} and {hits} to match the lists {new_gsizes}.
    # Puts brackets around hits.
    ng = len(gsizes)
    assert len(hsizes) == ng - 1
    nog = len(new_gsizes)
    assert len(new_hsizes) == nog - 1
    assert nog <= ng
    fgaps = [ '', ]
    fgsizes = [ 0 ]
    fhits = []
    fhsizes = []
    iog = 0
    for ig in range(ng):
        h = '' if ig == 0 else '[' + hits[ig-1] + ']'
        hs = 0 if ig == 0 else hsizes[ig-1]
        g = gaps[ig]
        gs = gsizes[ig]
        if fgsizes[iog] + hs + gs <= new_gsizes[iog]:
            # Current hit+gap still fits into the current merged gap:
            fgaps[iog] += h + g
            fgsizes[iog] += hs + gs
        elif fgsizes[iog] == new_gsizes[iog]:
            # Current merged gap is complete; this hit starts a new one:
            fhits.append(h); fhsizes.append(hs)
            fgaps.append(g); fgsizes.append(gs)
            iog += 1
            assert iog == len(fgsizes) - 1
        else:
            err.write(f"!* {gaps = !r}\n")
            err.write(f"!* {hits = !r}\n")
            err.write(f"!* {gsizes = !r}\n")
            err.write(f"!* {hsizes = !r}\n")
            err.write(f"!* {new_gsizes = !r}\n")
            err.write(f"!* {new_hsizes = !r}\n")
            err.write(f"!* {iog = !r}\n")
            err.write(f"!* {fgaps = !r}\n")
            err.write(f"!* {fhits = !r}\n")
            err.write(f"!* {fgsizes = !r}\n")
            err.write(f"!* {fhsizes = !r}\n")
            assert False
    # Paranoia:
    assert len(fgaps) == nog
    assert len(fgsizes) == nog
    assert len(fhits) == nog - 1
    assert len(fhsizes) == nog - 1
    for iog in range(nog):
        assert fgsizes[iog] == new_gsizes[iog]
        if iog > 0: assert fhsizes[iog-1] == new_hsizes[iog-1]
    return fgaps, fgsizes, fhits, fhsizes
    # ----------------------------------------------------------------------

def test_stuff():
    # Smoke test: cleans sample texts of every {utype}, splits them into
    # units, and lists occurrences of sample words, printing to stderr.

    def data_error(msg):
        err.write(f"** {msg}\n"); assert False
        # ....................................................................

    for utype in ( "wc", "wp", "ec", "ch", "ps" ):
        err.write(f"!* counting '{utype}' units\n")
        pat_line, pat_unit, pat_sepa, clean_sepa = \
            get_parsing_patterns(utype)
        if utype == "ec" or utype == "wc" or utype == "wp":
            text = "foo,par.paz-q.?ofoo,parrifoo"
            words = [ "foo", "par", ]
            if utype == "wc" or utype == "wp":
                words += [ "[.]foo", "foo[.]", "[.]foo[.]", "foo[.]par", ]
                assert clean_sepa == '.'
            elif utype == "ec":
                assert clean_sepa == ''
        elif utype == "ps":
            text = " tóng yè wèi. zhǔ zhì è zhe zhǔ zhì yīn. pí, zhǔ wǔ zhì, shāng ǔ. ǔ."
            words = [ "zhǔ", "zhǔ zhì" ]
            assert clean_sepa == ' '
        elif utype == "ch":
            text = "白石英:[味]甘微温。[主治]消渴,阴痿不足,欬逆白石,青石黑石脂等:[味]甘平。[主]黄疸,泄利"
            words = [ "白石", "主治" ]
            assert clean_sepa == ''
        else:
            assert False
        err.write(f"!* raw text = '{text}')\n")
        text = clean_up_raw_text(text, utype, data_error)
        err.write(f"!* cleaned text = '{text}')\n")
        units, ct_sepa = \
            split_text_into_units(text, utype, pat_unit, pat_sepa, data_error)
        err.write(f"!* {ct_sepa = } {units = !r}\n")
        err.write("\n")
        for word in words:
            err.write(f"!* looking up '{word}'\n")
            oclist = list_occurrences(word, units, clean_sepa, utype, data_error)
            err.write(f"!* found = {oclist}\n")
        err.write("\n")
    return
    # ----------------------------------------------------------------------

if len(sys.argv) > 1 and sys.argv[1] == "test":
    test_stuff()