#! /usr/bin/python3

import sys, re
import html_gen as h
from process_funcs import bash
import html_report_funcs as hr

last_edit = "Last edited on 2026-02-09 15:28:10 by stolfi"
# ^ Timestamp string that `main` passes as the last argument of `h.output_doc`.
# NOTE(review): presumably rendered on the page and refreshed by an editor
# hook on save -- confirm before moving or reformatting that line.

def main() -> int:
  """Build the top-level page of report [076] and write it to standard output.

  The page discusses how paragraph ('parag') breaks were located in the
  Starred Paragraphs Section (SPS) of the Voynich Manuscript. The document
  is assembled through the project modules `html_gen` (`h`) and
  `html_report_funcs` (`hr`), then emitted with `h.output_doc` on
  `sys.stdout`, together with the module-level `last_edit` string.

  All text passed to `h.parags` / `h.enum_item_parags` is user-facing page
  content. Markup such as "<%>", "<!NoS>", "{dcheo}", "@t" and
  "<f105v.P1.6>" is left exactly as written.
  NOTE(review): presumably those helpers translate that markup -- confirm.

  Returns:
    Always 0.
  """

  # `last_edit` is only read here, so no `global` declaration is needed.

  title = "[076] Locating parag breaks in the Starred Parags Section"
  st = h.new_doc(title, "#eeffdd")

  # ---- Summary -------------------------------------------------------

  h.section(st, 2, "Summary")
  h.parags(st, """This webpage reports an attempt at locating the paragraph breaks in
  the <b>Starred Paragraphs</b> section (SPS) of the Voynich Manuscript (VMS),
  also known as the Recipes section. 
  
  This page has a general discussion of the problem and methodology. For the 
  detailed page-by-page report, see <a href="sec-per-page/page.html">this sub-report</a>.""")

  # ---- Paragraphs and why they matter --------------------------------

  h.section(st, 2, "Paragraphs and why they matter")

  h.parags(st, """ The text of the SPS clearly consists of multiple
  paragraphs ('parags' for short), each comprising one or more lines. Each
  line normally starts at the left margin of the text and continues up
  to the right margin, or to the end of the paragraph, whichever comes
  first. 
  
  Considering the cost of vellum and the difficulty of erasing it,
  we assume that the Author (the person who decided to create the book,
  devised the script, created or obtained the information, etc.) first
  wrote a draft of the SPS on paper, and then recruited a Scribe to
  transpose the final draft to vellum. We assume that both Author and
  Scribe understood that the parags were significant, while the line
  breaks within each parag were not; so that the Scribe disregarded the
  line breaks in the draft, and introduced new ones whenever he/she
  reached the right margin. (While the Scribe <i>may</i> have been the
  Author him/herself, there is evidence that they were different
  persons, and that the Scribe did not know much about the meaning of
  the text, probably not even its language.)

  Identifying the paragraphs <i>as intended by the Author</i> is necessary
  for certain analyses, such as comparing the frequencies of words at
  the start, middle, and end of paragraphs, or trying to identify words
  that occur preferably at the end of sentences (like the verbs in
  German and other SOV languages).
  
  There are several features that are believed to indicate those breaks;
  but they often disagree or are absent, and there are several blocks of
  consecutive lines that, based on internal and external clues, can be
  presumed to consist of two or more parags run together. Locating the
  paragraph breaks within those blocks requires some more or less
  arbitrary choices, guided by the available clues. 
  
  This article describes a set of criteria for guiding these choices,
  and shows the results of applying them to each page of the SPS.""")

  h.parags(st, hr.html_subdoc_link(st, "sec-glossary", "page", thumb_img = None, link_text = "Glossary of general terms"))

  h.parags(st, hr.html_subdoc_link(st, "sec-transcription", "page", thumb_img = None, link_text =  "The transcription file"))

  h.section(st, 3, "Marking the parags")
  
  h.parags(st, """Ultimately the chosen parag breaks were marked
  manually in the transcription file, by the prefix "<%>" on the head line
  and the suffix "<$>" on the tail line, as explained in <a
  href="sec-transcription/page.html">this sub-document</a>. On each head
  line we also added by hand the star assignment comments
  "<!S<i>NN</i>>" and "<!NoS>". The comments "<!WGP>" and "<!WGN>" were
  used to mark wide linegaps, whether they were chosen parag breaks or
  not.
  
  A program was then used to evaluate each of these marked parags
  according to the criteria described below. The results are reported in
  <a href="sec-per-page/page.html">this sub-document</a>.""")

  # ---- Stars in the SPS ----------------------------------------------

  h.section(st, 2, "Stars in the SPS")

  h.parags(st, """The left margin of every page of the SPS contains a
  column of <b>stars</b>.  For the purposes of this report, the stars in 
  each page are designated S01, S02, etc., from the top down.""")
  
  # NOTE(review): this item is emitted outside any begin_enum/end_enum pair,
  # whereas the sub-document links above go through h.parags -- confirm
  # that this is intended.
  h.enum_item_parags(st,
    hr.html_subdoc_link(st, "sec-star-glossary", "page", thumb_img = None, link_text = "Glossary of star-related terms")
  )

  h.section(st, 3, "Appearance")

  h.parags(st, """
  The stars of the SPS look very similar to those in the Zodiac section,
  and some of those in the Cosmo section. Each star is typically ~6 mm
  across. Its outline is drawn with the same ink as the text, and
  consists of 6 to 9 <b>rays</b>, typically ~1.5 mm wide at the base
  and ~2 mm long, with two straight or slightly curved sides, that form
  a sharp or slightly rounded point. The bases of the rays define a
  round <b>body</b> ~2 mm across.

  The number of rays may carry some information, but there is no
  evidence for that.""")

  h.section(st, 3, "Colors")

  h.parags(st, """
  A few stars are <b>clear</b> -- just outlined, not painted. All the
  others are partially or totally painted with one of two colors:
  <b>yel</b>, a partly transparent watercolor-like golden yellow paint
  (apparently the same paint/ink used on the hair of the nymphs of the
  Zodiac section), or <b>red</b>, an opaque tempera-like dark red paint
  (presumably the same used for the lips of nymphs in the Zodiac,
  flowers in the Herbal section, etc.)
  
  The core of stars that are painted red is usually invisible, while
  that of those painted yel is normally visible through the paint.

  The two colors may have been applied at different times by
  different people (the Light Painter and the Dark Painter,
  respectively), who may have had different levels of knowledge about
  the VMS. Thus it is possible that the yel color on stars has some
  information, for instance about parag breaks, while the red color has
  not.
  
  On most pages the colors red and yel alternate. Sometimes, as in f104r
  and f115r, there are hiccups where a color is repeated. On some pages,
  like f108r, the colors seem to be random, and several stars are
  neither red nor yel (left unpainted or maybe painted white). Since the
  colors may well be a later addition, we just ignored them when
  choosing the parag breaks.
  
  """)

  h.section(st, 3, "Tails")

  h.parags(st, """
  Most stars in the SPS have a <b>tail</b>, which is a thin curvy
  line extending down from the star's outline.  
  
  The tail usually extends or starts from the tip of a ray near
  05:00. In this case the ray is often sharper, longer, and curved. (This
  indicates that the tail was drawn by the same Scribe who drew the
  stars.) But sometimes the tail starts at the notch between two rays.
  
  On some stars the ray is extended to most of the length of the
  tail, thus creating a <b>fat tail</b>.  The fat tails may have some
  information too. For instance, a fat tail may signify that the star is
  associated with two parag heads that are too close together to receive
  individual stars.
  
  However, there doesn't seem to be a sharp distinction between fat and thin tails.
  The difference is only how far the fat part of the tail extends.""")

  h.section(st, 3, "Drawing order")

  h.parags(st, """ 
  While the finished outline of each star is a single
  continuous line, it may be drawn in two or more separate strokes.
  
  In some thin-tailed stars, the outline was drawn first and the tail
  was added as a separate stroke. In other cases the drawing of the
  outline apparently started at a ray tip around 05:00, went CW all
  around, then instead of stopping at the starting point continued out
  to form the (single) tail.  
  
  In the case of stars with fat tails, sometimes the star outline is
  drawn as two or more strokes in opposite directions, starting at a ray
  tip in the general NW area, until a ray tip around 05:00, and
  each stroke is then extended to make one edge of the tail. See S13 in
  f113r for example.
  
  Other times a star with fat tail is drawn by drawing a thin-tail star
  first (in either of the possible ways), and then adding the left edge
  of the tail as a separate stroke. See the tail of S06 in f114v for
  example.""")

  h.section(st, 3, "Line assignment and starlets")

  h.parags(st, """Presumably, the intention of the Author or
  Scribe was to have one star for each parag of the SPS, aligned with
  the parag's head line, like the bullet in an item of an itemized list.
  
  However the reality is somewhat far from this ideal. A star that is
  presumably associated with an obvious parag is often located at some
  distance, up or down, from the head, or is missing entirely. Part of
  the task of identifying the paragraph breaks is to assign each star to
  a text line, in a one-to-one way; and to decide which text lines
  (and, in some cases, which parags) will be left without stars.""")

  # ---- Easy cases: perfect parags ------------------------------------

  h.section(st, 2, "Easy cases: Perfect parags")

  h.section(st, 3, "Page boundaries and titles")
  
  # The link below used to point at "page.hrml", which does not match the
  # "page.html" target used by every other link to that sub-document.
  h.parags(st, """
  For reasons detailed below, we assumed that 
  each parag is wholly contained in a single page. That is, 
  page boundaries are hard parag boundaries.
  
  The SPS contains three short centered or right-justified lines
  that are generally assumed to be section titles or some such.
  See <a href='sec-transcription/page.html#Titles'>this document</a>.
  We assume that those titles do not belong to any parag,
  so they are themselves hard parag boundary markers too.
  
  It is possible that there are other section headers that were not
  recognized as such.  We have no way to check for that;
  such headers, if they exist, have been included in parags.""")
  
  h.section(st, 3, "Parag properties")
  
  h.parags(st, """To identify a set of one or more consecutive text lines,
  not including any "titles",
  as a paragraph, we used some or all of the following criteria:""")
  
  h.begin_enum(st, "ul")

  h.enum_item_parags(st, """<b>P1</b>. The first of these lines follows
    a short line <b>or</b> a wide linegap (or is the first line in the page, 
    or follows a 'title').""")

  h.enum_item_parags(st, """<b>P2</b>. The last of these lines is short
    <b>or</b> precedes a wide linegap (or is the last line of the page, or 
    precedes a 'title').""")

  h.enum_item_parags(st, """<b>P3</b>. All lines other than the last one
    are long lines.""")

  h.enum_item_parags(st, """<b>P4</b>. There are no puffs in any of
    these lines, except possibly in the first line.""")

  h.enum_item_parags(st, """<b>P5</b>. The first of these lines has an
    assigned starlet.""")

  h.enum_item_parags(st, """<b>P6</b>. None of these lines, except the
    first one, has an assigned starlet.""")

  h.enum_item_parags(st, """<b>P7</b>. There are no wide linegaps <i>between</i>
    any two of these lines.""")

  h.enum_item_parags(st, """<b>P8</b>. All lines are left-justified
    (start on the left rail).""")

  h.end_enum(st, "ul")

  h.parags(st, """
  Note that P4 does not *require* the existence of puffs in the head
  line of a perfect parag. It only prohibits them in the other
  lines.
  
  Rules P1 and P2 effectively require that a perfect parag be entirely
  contained within one page.""")

  h.section(st, 3, "Perfect parags")
  
  # Only conditions P1..P8 are defined above, so the ranges in the text
  # below say "P1-P8".
  # TODO: the "???" placeholders still have to be replaced by actual counts.
  h.parags(st, """
  A set of consecutive non-title lines that satisfies all conditions P1-P8
  will be called a <b>perfect parag</b>.
  
  By the above criteria, one can identify ??? perfect parags in the SPS,
  covering ??? of the ??? text lines. When those are excluded, the lines
  that remain ???""")

  h.section(st, 3, "Quasi-perfect parags")

  h.parags(st, """
  If a set of consecutive lines satisfies all the conditions P1-P8 except P4, 
  (that is, if some line other than the first one contains puffs) we will 
  call those lines a <b>quasi-perfect parag</b>.
  
  There are ??? parags that are not perfect but quasi-perfect.""")

  h.section(st, 3, "Pluperfect parags")

  h.parags(st, """Two other conditions that are relevant for parag splitting are:""")

  h.begin_enum(st, "ul")

  h.enum_item_parags(st, """<b>Q1</b>. The first line of the set follows
    a short line <b>and</b> a wide linegap (or is the first line in the page, 
    or follows a 'title').""")

  h.enum_item_parags(st, """<b>Q2</b>. The first glyph of the
    head line is either a puff or a @t.""")

  h.end_enum(st, "ul")

  h.parags(st, """
  Note that condition Q1 is a stronger version of P1. If a set of lines is a
  perfect parag and satisfies either of these two criteria, we call it a
  <b>pluperfect</b> parag.""")

  # ---- Hard cases: imperfect parags ----------------------------------

  h.section(st, 2, "Hard cases: Imperfect parags")

  h.section(st, 3, "Possible causes")

  # "left margin" and "SPS" below agree with the "Stars in the SPS" section.
  h.parags(st, """ 
  While most of the text can be parsed as perfect
  parags, there are several 'imperfect blocks' of consecutive lines such
  that, in any block, any candidate parag fails at least one of the
  criteria P1-P8. Thus, within each of those imperfect blocks we had to
  determine where to break parags by less objective criteria.

  As discussed above, stars on the left margin do not seem to be reliable parag markers.
  They are often more than one line off from the head, or missing
  entirely.  There are many possible scenarios for the creation and final scribing of the SPS that
  would have led to starlets being omitted or misaligned by mistake.

  Another possible cause for dubious parag breaks is the Scribe
  sometimes starting a new parag on the same line as the tail of the
  previous parag, when the latter was a short line. For example, on
  <f105v.P1.6> the parag break should perhaps be after the first word
  {dcheo}. Likewise, on <f105v.P1.13>, the parag break should perhaps be
  after the first word {saiin}.""")

  h.section(st, 3, "Splitting the imperfect blocks")

  h.parags(st, """
  To split imperfect blocks, we set a 'definitive' parag break after every short line, even if
  the next line cannot get a starlet assigned to it. That line will be
  the start of an 'unstarred' parag.

  We also put a 'tentative' parag break before any line that has at least
  one puff, even if the previous line is not short and no starlet can be
  assigned to it.

  Those two decisions divide each imperfect block into 'tentative
  parags'. Each tentative parag is a set of consecutive lines such that
  no line except perhaps the first has any puffs or assigned starlet,
  and no line except perhaps the last one is short-length.
  
  The insertion of further parag breaks was guided by the 
  above criteria.  The detailed reports are in 
  <a href="sec-per-page/page.html">this sub-report</a>.""")

  # ---- Links section and output --------------------------------------

  hr.links_section(st)

  h.output_doc(st, sys.stdout, 99, last_edit)
  return 0
  # ----------------------------------------------------------------------

# Script entry point. The call runs unconditionally at module load (there is
# no `__name__ == "__main__"` guard), so importing this file also generates
# the page. The value returned by `main` is discarded.
main()