#! /usr/bin/python3
# Generates the HTML report page "Detecting ink under paint" and writes it
# to standard output.  Relies on the project modules {html_gen} (markup
# helpers), {html_report_funcs} (image/figure helpers), and
# {process_funcs.bash} (shell command runner).

import sys, re

import html_gen as h
from process_funcs import bash
import html_report_funcs as hr

last_edit = "Last edited on 2025-11-19 07:37:31 by stolfi"

def main():
    """Builds the whole "Detecting ink under paint" report as an HTML
    document and writes it to {sys.stdout} via {h.output_doc}.
    Returns 0 on success."""
    title = "Detecting ink under paint"
    st = h.new_doc(title, "#eeffdd")

    h.parags(st, """This webpage is about the many places in the Voynich Manuscript (VMS) where glyphs, words, or entire lines were (or are suspected to have been) obfuscated by being painted over with a watercolor or tempera type of paint. The paint is often semi-transparent, given some hope that the original text and drawings can be recovered with the help of image processing techniques. Such recovery could improve the transcription of the text affect the interpretation of the figures.""")

    h.section(st, 2, "Evidence for obscured ink")

    h.parags(st, """The Painter who applied the semi-opaque tempera colors often painted over inked outlines. Examples are easily seen where these inked strokes were still dark and clear, like (A,B) below.""")

    # TODO(review): the original had the unresolved placeholder
    # "???f079r1-green-over-ink, or maybe the whole pool" here -- confirm
    # which image name {hr.online_image} should receive.
    hr.online_image(st, "f079r1-green-over-ink")

    h.parags(st, """Besides obscuring those strokes, it seems that the painting also washed away some of the ink, and sometimes deposited it a short distance away, as in (D). Thus any ink strokes that were already quite faint and faded, like (E), must have become invisible to the naked after being painted over. 
In particular, there seems to be something between the feet of that nymph, and maybe for something under the east end of the "backrest" she is leaning onto.""")

    h.section(st, 2, "Principles of Bayesian ink-paint separation")

    h.parags(st, """The idea is as follows.""")

    h.begin_enum(st, "ul")
    h.enum_item_parags(st, """Take an image of an area which is suspected of having "invisible" drawings or text under some semi-opaque paint.""")
    h.enum_item_parags(st, """Select a set of pixels A representative of what one wants to detect, like places where there is definitely ink covered by green paint.""")
    h.enum_item_parags(st, """Select one or more additional sets B, C, ... that are to be distinguished from A -- like places where there is green paint with but almost surely without ink underneath.""")
    h.enum_item_parags(st, """Look at the colors of those pixels as points of three-dimensional space, within the unit cube where (0,0,0) is black, (1,1,1) is white, (1,0,0) is red, etc. Here is an example with three subsets of a page, representative of blank vellum (red), dark text ink (green), and green paint over blank vellum (blue):""")

    # TODO(review): the original had the unresolved placeholder
    # "???color clouds" here -- replace with the actual scatter-plot image name.
    h.enum_item_image(st, "color clouds")

    h.enum_item_parags(st, """Approximate each cloud ou points A, B, C, ... by a trivariate Gaussian probability density function (PDF). This can be visualized as a fuzzy ellipsoid with varied dimensions along three axes, with some generic orientation in space.""")
    h.enum_item_parags(st, """Take each pixel of the image and use Bayes's formula to estimate the probability that the pixel belongs to each distribution A, B, C, ... 
or is an "outlier" that probably does not belong to any of them.""")
    h.enum_item_parags(st, """Write one grayscale image for each set, showing the probability of each pixel belonging to that set.""")
    h.end_enum(st, "ul")

    h.parags(st, """As others have pointed out, the texture of the vellum itself provides a lot of subtle lines and curves, which can easily be confused for traces of drawing or text. However, the above approach i honest at least in the sense that the final classification is made independently for each pixel, based only on its color; without trying to look for multi-pixel patterns like lines or characters. Which is where actual pareidolia comes in. It will be left to the human user to "see" such patterns on the computed probability maps. The user's choice of sample pixels will influence the classification, but only through their colors, not through their positions or adjacency relations. This classification method is rather robust in the sense that small perturbations of a pixel's color will usually not affect the classification, unless that color is on the transition between two provinces with overlapping color distributions; and even then the change in the classification (the probability of the pixel belonging to each propvince) will be gradual.""")

    h.section(st, 3, "Challenges and limitations")

    h.parags(st, """The following image (a clip of f79r magnified 400%) shows some of the challenges on the way to uncovering drawings that have been painted over.""")

    hr.image(st, "f79r magnified 400%")

    h.parags(st, """There are some ink traces, like (A), that were quite dark to begin with and suffered little from being painted over, apart from their color getting mixed with the paint color. But there are some ink traces, like (B), that are very faint, almost invisible. Would we be able to recover them if they were painted over? 
I think that smudges at ( C) look like they were the original outline of the nymph's left foot which was incorrectly retraced as the thick dark stroke a bit further to the NW. Those smudges are only a little fainter than the toes of that same foot, which almost certainly were there. To make things worse, when ink strokes were painted over, sometimes the ink would dissolve and either would spread around, or would be pushed by the brush for a small distance; as seems to have happened at (D). Presumably, some strokes were completely washed away and spread over a larger area, mixed with the green paint. At (E) there are some smudges which look like those at ( C) or at the nymph's toes, and they seem to form a rounded shape that does not seem to be just a random stain or vellum defects. Will we be able to determine if it was indeed painted-over ink, and, if so, recover enough of it to tell what it is?""")

    h.section(st, 3, "Case study: leaf on f22r")

    h.parags(st, """It has been concetured that there is some writing under the green paint on f22r. Specifically, the front leaf of the second leaf bundle on the left side of the stem.""")

    hr.image(st, "f22r leaf 2,W - clip")

    h.parags(st, """However, there is some bleedthrough of the dark brown ink of the other side (f22r) that may be responsible for those darker streaks on that clip:""")

    hr.image(st, "f22r leaf 2,W - pilc")
    hr.image(st, "f22r leaf 2,W - blip")

    # Was "hr.parags" in the original -- every other paragraph call in this
    # file uses h.parags, so this was almost surely a module-prefix typo.
    h.parags(st, """Image B is a clip of the matching area on f22r (as accurately as I could determine it), flipped left-to-right for convenient comparison. Image C is image B with inverted colors and ~50% transparency overlaid on image A. On the left half of mage A, notice the faint ghosts of the berries of image B. On image C, notice that there are many berries on the other side of your area of interest. 
However, the ghosts are not that strong, so maybe there are indeed inked details under the green paint.""")

    h.section(st, 3, "Case study: ochre structure on f35r")

    hr.image(st, "f35r???")

    h.parags(st, """There does seem to be writing inside that big ocher-painted area, but is it just bleedthrough or offset? Offset from f34v can be excluded since there is no writing on f34v anywhere near that area. It could be offset from some other page, that happened back when the bifolios were still unbound. But it does look like bleedthrough from f35v. Here is a clip of f35v corresponding to that area, flipped L-R for easy comparison:""")

    h.parags(st, """It does look like most if not all of the "writing under the ocher paint" is indeed bleed-through of the writing on f35v. Here is the second clip with colors inverted and transparent background, overlaid on the first clip: There still seems to be some ink traces in the right half of that structure, not attributable to bleedthrough. They may be decoration.""")

    h.section(st, 3, "Using Bayesian classification for retracing")

    # Was "h.parags(sr, ...)" in the original: {sr} is undefined; the
    # document handle is {st} everywhere else.
    h.parags(st, """As others have noted, the colors of the text pixels are a continuum between full ink color and bare vellum color. That holds at least for the orginal and first retrace (Rt1) inks. These two inks sem to have the same hue and differ mostly in brightness, and both merge with the blank parchment ink as they become fainter, and along the edges of the strokes.""")

    h.section(st, 3, "Using pixel windows instead of single pixels")

    h.parags(st, """A more sophisticated variant of this method would use pixel windows instead of single pixels. Namely, for each pixel on the image, we extract the color of that pixel and of a fixed set of M nearby pixels, such as the 4 nearest neighbors (M=5) or a 3x3 window centered at the pixel (M=9). Then each sample and each pixel to be classified becomes a point of R^{3*M}, and the distributions for each province are multivariate Gaussians. 
This approach may be valuable to recover strokes that are only a pixel wide.""")

    # Was "h.section(st, 3. "Prior probabilities")" -- a period instead of
    # a comma after the level number (SyntaxError).
    h.section(st, 3, "Prior probabilities")

    h.parags(st, """A fundamental limitation of Bayesian inference is that it does not give the probabilities of each possible cause, it only tell us how to change out a priori probabilities. So we must specify those priors in order to use the formula. When the evidence is strong enough, the results do not depend much on the priors. These matter only when the evidence is ambiguous. In the worst case, when the evidence carries no information about the cause, the results will be the same as the priors. When it rains at night, streets are usually wet by the next morning. If you have no special information about the night's weather, and you see the street wet in the morning, it is reasonable to assume that it rained. Bayes's formula will say so, whether your prior for "it rained" is 1% or 99%. That's because P(Wet|NoRain) is very small; say, 0.0001 (but not zero, because there may have been a flood or a street washing truck mat have showed up. While P(Wet|Rain) is basically 1. Therefore P(Rain)P(Wet|Rain) is still much bigger than P(NoRain)P(Wet|NoRain) in any case, and these numbers will become ~1 and ~0 after normalization. But if you are sure that it did not rain, because you have been out stargazing in the garden all night, the wet street should not make you change your belief. That's because your P(Rain) will be essentially zero, and then P(Rain)P(Wet|Rain) will become much smaller than P(NoRain)P(Wet|NoRain), even if P(Wet|NoRain) is only 0.00001. For the analysis of the f79r pool illustrated above the prior probability of "OTHER" was set arbitrarily at 0.05, and that of each of the three other classes was set to 0.95/3 = ~0.32.""")

    h.section(st, 3, "Comparison with linear vector machines")

    h.parags(st, """This approach is somewhat similar to the so-called linear vector machine (LVM) classification. 
In that method, each object to be classified (in this case, a pixel, or a small pixel window) is a vector of some d-dimensional space R^d, and any two classes are separated by a set of linear functions. In contrast, Bayesian classification with Gaussian distributions is inherently non-linear, and usually extremely so. For instance, imagine that you have only two Gaussian classes (plus "OTHER"), where class A has a very broad spherical distribution centered at middle gray (0.5,0.5,0.5) and class B has a much narrower one centered at slightly darker gray (0.4,0.4,0.4). Bayesian classification will assign class A to colors inside the A sphere, except within a small region around the darker gray, where it will say B. A linear classifier will be unable to delimit even the A sphere, much less the B hole inside it. That is why linear vector classifiers are usually applied to non-linear functions of the inputs, the (improperly) so called "kernels". Which requires the user to come up with suitable kernels. If one tries to use as kernels all polynomials on the input coordinates up to degree (say) 4, one gets so many kernels that the classification will probably be garbage. That is also a danger if one uses "magical" non-linear classifiers with zillions of internal parameters, like neural networks...""")

    h.section(st, 3, "Local vs. global training")

    h.parags(st, """The samples of each province can be collected locally, on the same page or even on the same figure under analysis; or globally, scatered all over the book. Global sampling could be slightly safer against accidental color variations affecting only a small area of a page, such as stains or ink blots. However, local sampling is justified because there seems to be some overall variation from folio to folio in the colors of parchment, ink, and paint. For instance, the green paint of f8r seems to be more bluish than that used in the Bio section. 
And there is also a much larger variety of paints and stains over the whole MS. On the southwest pool of f79r, for example, it is not necessary to include the red, blue, yellow, and rusty paints as separate provinces. They will be classified as "OTHER" without significantly affecting the classification of the other provinces of interest. And we don't have to worry about ketchup stains, or the gray offset from blue flowers, that are important "noise" features on some other pages.""")

    h.section(st, 3, "Imaging requirements")

    h.parags(st, """Ideally we should do this with high-resolution. The resolution should be high enough for the thinnest ink strokes to be several pixels across, so that there will be pixels entirely inside the stroke. Ideally the images should also be uncompressed (to avoid JPEG encoding artifacts), taken under frontal illumination (to reduce brightness variations due to the roughness of the vellum surface) with multiple narrow-band light from ultraviolet to infrared (to distinguish colors that just look the same), and with linear encoding (so that the color clouds will not be distorted away from the ideal Gaussian shape). Unfortunately we don't have multispectral scans for any of the pages that may have significant details hidden under the paint. And even those that we do have are taken with oblique lighting that creates light and dark spots at every tiny bump on the vellum surface. So we must do with the Beinecke 2014 scans, which have frustratingly low resolution (some ink traces being only a couple of pixels across), only the three RGB color coordinates, oblique illumination, non-linear "gamma" encoding, and complex JPEG compression artifacts.""")

    h.section(st, 3, "Where should we look")

    h.parags(st, """The herbal pages have green paint, but the ink that can be seen under it is just boring nervures or leaf outlines. 
At best, those images could be useful to validate this approach.""")

    h.section(st, 2, "When was the paint applied?")

    h.parags(st, """According to Rene, microscope examination of the folio number on f42 shows definitely that the paint was on top of the ink. The paint included small crystals which were obviously on top of everything else. Besides, it seems unlikely that whoever wrote the folio number would choose to write it over an already painted area.""")

    h.section(st, 2, "Some examples")

    h.parags(st, """Instances of ink traces being painted over are rather common. Here are some annotated examples.""")

    h.parags(st, """*NOTE*: The claims in image captions are all personal guesses with varied degrees of confidence. For brevity, they are stated as facts; however, the reader should assume disclaimers such as "apparently", "probably", "it seems that", etc. before every claim that is not totally evident from the images.""")

    # List the annotated clip images, sorted, into the scratch file ".files".
    bash("(cd images && ls -d f[0-9][0-9][0-9][rv][0-9]-* ) | sort > .files")
    # NOTE(review): {fnames} was undefined in the original.  Reading the
    # file-name list back from ".files" (one name per line, as produced by
    # the shell command above) is the evident intent -- confirm that
    # {hr.clip_fig_links_and_pages_enum} expects a list of names.
    with open(".files") as flist:
        fnames = [ln.strip() for ln in flist if ln.strip()]
    hr.clip_fig_links_and_pages_enum(st, fnames)

    h.output_doc(st, sys.stdout, 99, last_edit)
    return 0

# ----------------------------------------------------------------------

def test_html_gen():
    """Ad-hoc smoke test: prints the effect of {h.simple_markup} on a
    sample string to stderr (via {h.err})."""
    txt = "We need (/weed/) but not (*knot*)"
    h.err("[[" + txt + "]] -> [[" + h.simple_markup(txt) + "]]\n")
    return

# ----------------------------------------------------------------------
# test_html_gen()

if __name__ == "__main__":
    main()