#! /usr/bin/python3
# Generates an HTML report on the spacing of q-tokens within the paragraphs
# of the Voynich Manuscript transcription, writing the document to stdout.

import sys, re

import html_gen as h
from process_funcs import bash
import html_report_funcs as hr

# Timestamp shown in the generated document's footer.
last_edit = "Last edited on 2026-01-19 22:48:41 by stolfi"

def main():
    """Build and emit the q-token gap analysis report as HTML on stdout.

    Returns 0 on success."""

    title = "Analysis of q-token gaps in parags"
    st = h.new_doc(title, "#eeffdd")

    h.section(st, 2, "Summary")

    h.parags(st, """A /q-token/ is a token (word occurrence) whose EVA
      transcription begins with the EVA glyphs 'qo' or 'oqo'. This note
      examines the spacing between q-tokens in the paragraph text of the VMS.

      More specifically, a /plain token/ is a token that is not a q-token; a
      /q-gap/ is the list of plain tokens in a parag before the first q-token
      (a BQ gap), between two successive q-tokens (a QQ gap), or after the
      last q-token (a QE gap). In particular, if the first token of a parag is
      a q-token, we have an empty BQ gap. If the last token is a q-token, we
      have an empty QE gap. And if two q-tokens occur in consecutive
      positions, we have an empty QQ gap.

      This note reports statistics on the lengths of those three kinds of
      q-gaps in the Starred Parags ('str') section of the Voynich Manuscript
      (VMS). The motivation was to test the hypothesis that the qo and oqo
      glyphs could be start-of-sentence markers, or more generally could be
      clues to sentence structure like subject case markers, verbal tense
      markers, etc. The results were not what I had expected, but are
      intriguing nonetheless.""")

    h.section(st, 2, "Results")

    def dofig(gtype):
        # Convert the precomputed PNG histogram for gap type {gtype} to a
        # local JPEG copy and insert it as a figure in the document.
        png_file = f"../out/str-{gtype}-hist.png"
        jpg_file = f"images/str-{gtype}-hist.jpg"
        bash(f"convert {png_file} {jpg_file}")
        hr.basic_figure(st, jpg_file, None, jpg_file)

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    h.parags(st, """The following histograms show how many q-gaps there are
      of each size and type, in the Starred Parags ('str') section.

      First, the BQ gaps:""")

    dofig('BQ')

    h.parags(st, """This plot already shows that the distribution of q-tokens
      in a paragraph is not random, at least until the first q-token. If each
      token could be plain or q-token with the same probability at any
      position, independently (that is, the q/plain attribute was defined by
      a Markov process of order zero), then the number of q-gaps (of any
      type) of size k would be a decaying exponential function A*p**k, where
      p = 0.8222; shown in the plots as the blue line with dots.

      As we can see, compared to the random model, there is a clear excess of
      parags with 2 to 6 plain tokens before the first q-token, and a
      scarcity of parags where the first q-token occurs at the beginning (gap
      size 0) or only after 7 to 11 plain tokens. """)

    dofig('QE')

    h.parags(st, """A similar pattern is visible in the QE plot. There is a
      relative excess of parags with 5 or 6 plain tokens after the last
      q-token, and a relative scarcity of parags that end with a q-token, or
      with exactly 2 or 8 plain tokens. """)

    dofig('QQ')

    h.parags(st, """On the other hand, pairs of /successive/ q-tokens (QQ
      gaps of size 0) are much more common than expected, and ditto for pairs
      separated by a single plain token; whereas pairs separated by three
      plain tokens are visibly less common than expected.

      The QQ gap histogram apparently excludes the possibility that the @qo
      and @oqo prefixes could be sentence separators or reliable markers of
      start-of-sentence. Anyway, the BQ and QE plots show that the q-tokens
      must have some "syntactic" role that prevents them from being close to
      the start or end of a parag. It is not obvious how one could reproduce
      these deviations from the zero-order Markov model with some other
      simple random generator.""")

    h.section(st, 3, "BE gaps")

    h.parags(st, """There were a few parags with no q-tokens at all. In such
      cases the entire parag is a q-gap, of a separate type (BE).

      In the zero-order Markov model, these BE gaps too have an exponentially
      decaying distribution, with the same exponent. However, there were too
      few of them to yield a meaningful plot.""")

    # h.section(st, 2, "Other sections")
    #
    # h.parags(st, """Similar patterns are observed in other sections, even though the
    #   fraction of tokens that are q-tokens varies. For Herbal-A ('hea'), the plots are:""")
    #
    # for gtype in ('BQ', 'QE', 'QQ'):
    #     png_file = f"../out/hea-{gtype}-hist.png"
    #     jpg_file = f"images/hea-{gtype}-hist.jpg"
    #     bash(f"convert {png_file} {jpg_file}")
    #     hr.basic_figure(st, jpg_file, None, jpg_file)
    #
    # h.parags(st, """Here the expected decay rate is p = 0.8962.""")

    h.section(st, 2, "Details")

    h.section(st, 3, "Input file cleanup")

    # NOTE(review): the phrase "All inline comments ''" below looks like its
    # example markup was lost (possibly eaten by an earlier HTML pass) --
    # confirm against the original text and restore the comment delimiters.
    h.parags(st, """For this analysis, an EVA transcription of the parags
      text from selected pages was reformatted by joining all lines of each
      parag into a single sequence of tokens. Line breaks internal to the
      parag, EVA dubious space codes ',', and figure intrusion markers '-'
      were converted to EVA word spaces '.'. All ligature indicators (braces
      '{}' or parentheses '()') were removed. All inline comments '' and the
      parag start/end codes '<%>' and '<$>' were deleted too. All weirdo
      codes were converted to the invalid glyph code '?', and the text was
      then mapped to lowercase.

      That turned the text into a string containing only lowercase letters,
      '?' codes, and dots '.'. The dots divided each parag into a sequence of
      one or more non-empty tokens. Then the q-tokens in each parag were
      identified, and the gaps between them were recorded separately for each
      type (BQ, QQ, QE).

      For this analysis, a token was considered invalid if it contained a q
      glyph but did not start with qo or oqo; or if it started with '?' or
      'o?', so that it could not be determined if it was a q-token or a plain
      one. Any q-gaps that contained invalid tokens were excluded from the
      plot. """)

    h.section(st, 3, "Handling of dubious spaces")

    h.parags(st, """ For the plots above, the EVA dubious space codes ','
      were mapped randomly to either '.' or nothing, at random, with equal
      probability. This hack had relatively little impact on the statistics
      above. Changing the probability of '.' to 0% or 100% only shifted the
      histograms a little, without affecting the qualitative conclusion --
      that the q-gap sizes are far from random.""")

    h.section(st, 3, "Source transcription file")

    h.parags(st, """The transcription used for this study was a new one that
      I created manually from the BL 2014 images. Currently it is complete
      only for the 'str' section. Compared to Rene's, it has somewhat
      different parag breaks (327 parags, four of which have no star "bullet"
      but are evident for other features), plain EVA letters in place of some
      weirdos (including @'Ih', @'ITh', etc., replaced by @'Ch', @'CTh',
      etc.), and a bit more commas.

      In the EVA transcription system it is understood that the horizontal
      arm of the 'q' is ligated to the 'o' at midline. While in the actual
      VMS the two are sometimes disconnected, we assumed that those cases are
      just pen glitches by the scribe.""")

    hr.links_section(st)

    h.output_doc(st, sys.stdout, 99, last_edit)
    return 0

# ----------------------------------------------------------------------
if __name__ == "__main__":
    main()