#! /usr/bin/python3
# Last edited on 2025-07-29 20:51:09 by stolfi
# To be imported by other Python programs

import sys, re
from ivtff_format import strip_comments

def enum_words_in_text(text, psep, visit):
  # Processes a line of {text} Voynichese in IVTFF-ish format and
  # enumerates the words in it with the respective expected number of
  # occurrences.
  # 
  # Assumes that each ',' (uncertain word space) is either a '.'
  # (definite word space) with probbaility {psep}, or not a word space
  # with probability {1-psep}. Calls {visit(prb,wd)} on every possible
  # word {wd} that may result this way. There may be multiple calls with
  # the same word {wd}; however, for each distinct {wd}, the sum of all
  # {prb} in all calls with that word is the expected number of
  # occurrences of {wd} in {text}.
  #
  # Ignores any embedded comments "<!...>" and markers like "<%>" and
  # "<$>" as well as leading and trailing alignment markers [«=»].
  # Ignores leading and trailing word delimiters [-,.]. Treats any
  # string of [-.,] with at least one [-.] as a single '.'. Treats ay
  # string of [,] as a single ','.
  #
  text = strip_comments(text)
  
  text = re.sub(r"[,]*[-.][-.,]*", ".", text)
  text = re.sub(r"[,]+", ",", text)
  fld = re.split(r"[.]", text)
  for i in range(len(fld)):
    if len(fld[i]) > 0:
      enum_words_in_block(fld[i], psep, visit)
  return 

def enum_words_in_block(block, psep, visit):
  # Assumes that {block} is a string of EVA text with no 
  # inline comments "<!...>", no locators like "<%>" and "<$>",
  # no alignment markers [«=»], no definite delimiters [-.].
  #
  # After removing leading, trailing, and redundant [,] delimiters
  # (uncertain spaces), the procedure simulates the process of
  # independently turning each ',' into a '.', with a given probability
  # {psep}, or deleting it, with probability {1-psep}, in all possible ways.
  # If there are {n} commas in the block, there are {2^n} such
  # possibilities.
  # 
  # Each possibility {Q} where exactly {k} commas are replaced by '.' is
  # assigned a probability {pr(Q) = psep^k*(1-psep)^(n-k)}. The block {block},
  # with these substitutions, is ideally split at the '.' into {k+1}
  # words {wd[0..k]}, and the procedure {visit(pr(Q),wd[i])} is called for
  # each word {wd[i]}.  
  #
  # For example, if {p} is {1/4} and {block} is "aa,bb,aa,cc" there are
  # 8 possibilities, with these probabilities:
  #
  #   "aa.bb.aa.cc"   {1/4^3   =  1/64}
  #   "aa.bb.aacc"    {3/4^3   =  3/64}
  #   "aa.bbaa.cc"    {3/4^3   =  3/64}
  #   "aa.bbaacc"     {3^2/4^3 =  9/64}
  #   "aabb.aa.cc"    {3/4^3   =  3/64}
  #   "aabb.aacc"     {3^2/4^3 =  9/64}
  #   "aabbaa.cc"     {3^2/4^3 =  9/64}
  #   "aabbaacc"      {3^3/4^3 = 27/64}
  #
  # Then, for instance, the calls to {visit} would ideally include
  #
  #   "aa"       with probs {1/64, 1/64, 3/64, 3/64, 9/64, 3/64} (total {20/64}).
  #   "aabb"     with probs {3/64, 9/64} (total {12/64})
  #   "bb"       with probs {1/64, 3/64} (total {4/64})
  #   ...
  #   "aabbaacc" with prob {27/64}
  #
  # In reality, the procedure will often combine several calls with the
  # same word into one call with the sum of all the probs {pr(Q)}. 
  # In particular, in the example above, the calls to {visit} will be
  #
  #   "aa"       with probs {1/4 = 16/64, (1/4)*(1*4) = 4/64} (total {20/64}).
  #   "aabb"     with prob  {(3/4)*(1/4) = 12/64}
  #   "bb"       with prob  {(1/4)*(1*4) = 4/64}
  #   ...
  #   "aabbaacc" with prob {27/64}
  # 
  # Thus the total number of calls to {visit} will actually be
  # {choose(n+2,2)=(n+1)*(n+2)/2} instead of {2^n}. Still, the sum of all
  # parameters {prb} passed to {visit} will be the same as in the
  # ideal (exponential) version above.
  #  
  block = block.strip(",")
  # sys.stderr.write(f"  block = {block}\n")
  wds = re.split(r"[,]+", block)
  wds = [ wd for wd in wds if len(wd) != 0 ]
  # sys.stderr.write(f"  wds = {wds}\n")
  m = len(wds)
  for i in range(m):
    for j in range(i+1, m+1):
      assert 0 <= i and i < j and j <= m
      pri = 1 if i == 0 else psep
      prj = 1 if j == m else psep
      prb =  pri*prj*(1 - psep)**(j - i - 1)
      wij = "".join(wds[i:j])
      visit(prb, wij)
  return

def write_prob_and_word(prb, wd):
  sys.stdout.write(f"{prb:16.8e} {wd}")
  sys.stdout.write("\n")
  return