#! /usr/bin/python3 # Last edited on 2025-07-29 20:51:09 by stolfi # To be imported by other Python programs import sys, re from ivtff_format import strip_comments def enum_words_in_text(text, psep, visit): # Processes a line of {text} Voynichese in IVTFF-ish format and # enumerates the words in it with the respective expected number of # occurrences. # # Assumes that each ',' (uncertain word space) is either a '.' # (definite word space) with probbaility {psep}, or not a word space # with probability {1-psep}. Calls {visit(prb,wd)} on every possible # word {wd} that may result this way. There may be multiple calls with # the same word {wd}; however, for each distinct {wd}, the sum of all # {prb} in all calls with that word is the expected number of # occurrences of {wd} in {text}. # # Ignores any embedded comments "" and markers like "<%>" and # "<$>" as well as leading and trailing alignment markers [«=»]. # Ignores leading and trailing word delimiters [-,.]. Treats any # string of [-.,] with at least one [-.] as a single '.'. Treats ay # string of [,] as a single ','. # text = strip_comments(text) text = re.sub(r"[,]*[-.][-.,]*", ".", text) text = re.sub(r"[,]+", ",", text) fld = re.split(r"[.]", text) for i in range(len(fld)): if len(fld[i]) > 0: enum_words_in_block(fld[i], psep, visit) return def enum_words_in_block(block, psep, visit): # Assumes that {block} is a string of EVA text with no # inline comments "", no locators like "<%>" and "<$>", # no alignment markers [«=»], no definite delimiters [-.]. # # After removing leading, trailing, and redundant [,] delimiters # (uncertain spaces), the procedure simulates the process of # independently turning each ',' into a '.', with a given probability # {psep}, or deleting it, with probability {1-psep}, in all possible ways. # If there are {n} commas in the block, there are {2^n} such # possibilities. # # Each possibility {Q} where exactly {k} commas are replaced by '.' is # assigned a probability {pr(Q) = psep^k*(1-psep)^(n-k)}. The block {block}, # with these substitutions, is ideally split at the '.' into {k+1} # words {wd[0..k]}, and the procedure {visit(pr(Q),wd[i])} is called for # each word {wd[i]}. # # For example, if {p} is {1/4} and {block} is "aa,bb,aa,cc" there are # 8 possibilities, with these probabilities: # # "aa.bb.aa.cc" {1/4^3 = 1/64} # "aa.bb.aacc" {3/4^3 = 3/64} # "aa.bbaa.cc" {3/4^3 = 3/64} # "aa.bbaacc" {3^2/4^3 = 9/64} # "aabb.aa.cc" {3/4^3 = 3/64} # "aabb.aacc" {3^2/4^3 = 9/64} # "aabbaa.cc" {3^2/4^3 = 9/64} # "aabbaacc" {3^3/4^3 = 27/64} # # Then, for instance, the calls to {visit} would ideally include # # "aa" with probs {1/64, 1/64, 3/64, 3/64, 9/64, 3/64} (total {20/64}). # "aabb" with probs {3/64, 9/64} (total {12/64}) # "bb" with probs {1/64, 3/64} (total {4/64}) # ... # "aabbaacc" with prob {27/64} # # In reality, the procedure will often combine several calls with the # same word into one call with the sum of all the probs {pr(Q)}. # In particular, in the example above, the calls to {visit} will be # # "aa" with probs {1/4 = 16/64, (1/4)*(1*4) = 4/64} (total {20/64}). # "aabb" with prob {(3/4)*(1/4) = 12/64} # "bb" with prob {(1/4)*(1*4) = 4/64} # ... # "aabbaacc" with prob {27/64} # # Thus the total number of calls to {visit} will actually be # {choose(n+2,2)=(n+1)*(n+2)/2} instead of {2^n}. Still, the sum of all # parameters {prb} passed to {visit} will be the same as in the # ideal (exponential) version above. # block = block.strip(",") # sys.stderr.write(f" block = {block}\n") wds = re.split(r"[,]+", block) wds = [ wd for wd in wds if len(wd) != 0 ] # sys.stderr.write(f" wds = {wds}\n") m = len(wds) for i in range(m): for j in range(i+1, m+1): assert 0 <= i and i < j and j <= m pri = 1 if i == 0 else psep prj = 1 if j == m else psep prb = pri*prj*(1 - psep)**(j - i - 1) wij = "".join(wds[i:j]) visit(prb, wij) return def write_prob_and_word(prb, wd): sys.stdout.write(f"{prb:16.8e} {wd}") sys.stdout.write("\n") return