Hacking at the Voynich manuscript - Side notes
058 Probabilistic model for Voynichese words

      [ ******* MEN AT WORK ******** ]
      [ ** PLEASE COME BACK LATER ** ]
      [ ******* MEN AT WORK ******** ]

Last edited on 2001-01-16 19:08:25 by stolfi

INTRODUCTION

  This note attempts to build a probabilistic grammar
  that approximates the VMS word distribution.
  
  [ Being redone on 2000-10-10 to exclude the letters <x>, <v>, <g>, <j>, 
    and weird ligatures such as <cthh>, <ith>, <cs>, etc. ]
  
SETTING UP THE ENVIRONMENT

  Links:
  
    ln -s ../../compute-cum-cum-freqs
    ln -s ../../compute-cum-freqs
    ln -s ../../compute-freqs
    ln -s ../../combine-counts
    ln -s ../../remove-freqs
    ln -s ../../totalize-fields
    ln -s ../../select-units
    ln -s ../../words-from-evt
    ln -s ../../factor-field-OK
    ln -s ../../format-counts-packed

SELECTING THE SIGNIFICANT SECTIONS FOR GRAMMAR ADJUSTMENT

  We now select among the data sets data/gud/{text,labs}/${sec}.wfr
  those that are worth matching the grammar against, namely
    
    the text tokens of each section, data/gud/text/${sec}.wfr,
    with the sections "unk.*" excluded, renamed
    prob/obs/${sec}/word.frq
    
    the text tokens in the whole book, data/gud/text/tot.n.wfr,
    renamed prob/obs/txt.n/word.frq
    
    ditto excluding line-initial and line-final tokens, data/gud/text/mid.n.wfr,
    renamed prob/obs/mid.n/word.frq
    
    the label tokens in the whole book, data/gud/labs/tot.n.wfr,
    renamed prob/obs/lab.n/word.frq

  In order to keep the mantle grammar simple, we also resolve the
  <eee> and <eeee> ambiguities by preprocessing the observed data. Our
  hack is to map each <ee> group to <bh>, by analogy with <ch> and
  <sh>. Let's cross our fingers and hope that we remember to handle or
  undo this hack at all proper times.

  Ok, here we go:

    mkdir prob prob/obs prob/gen

    foreach sec ( ${secs} ) 
      set infile = "data/gud/text/${sec}.wfr"
      set frfile = "prob/obs/${sec}/word.frq"
      echo "${infile} -> ${frfile}"
      if ( ! -d ${frfile:h} ) mkdir ${frfile:h}
      /bin/rm -f ${frfile}
      cat ${infile} \
        | resolve-eee-ambiguities -v inField=3 \
        > ${frfile}
    end

    foreach ft ( labs.lab text.txt )
      set f = ${ft:r}; set t = ${ft:e}
      set infile = "data/gud/${f}/tot.n.wfr"
      set frfile = "prob/obs/${t}.n/word.frq"
      echo "${infile} -> ${frfile}"
      if ( ! -d ${frfile:h} ) mkdir ${frfile:h}
      /bin/rm -f ${frfile}
      cat ${infile} \
        | resolve-eee-ambiguities -v inField=3 \
        > ${frfile}
    end

    set infile = "data/gud/text/mid.n.wfr"
    set frfile = "prob/obs/mid.n/word.frq"
    echo "${infile} -> ${frfile}"
    if ( ! -d ${frfile:h} ) mkdir ${frfile:h}
    /bin/rm -f ${frfile}
    cat ${infile} \
      | resolve-eee-ambiguities -v inField=3 \
      > ${frfile}

GRAMMAR FILES

  The generic probabilistic grammars are in gram/generic/txt.n/${part}.grx,
  where ${part} is "word", "core", "mantle", "sufmantle", etc.
  
  Further section-specific grammars are stored in
  gram/${grclass}/${sec}/${part}.grx, and ${grclass} is 
  
    manual   - adjusted by hand to the section
    
    trivial  - trivial (straight enumeration) grammars,
               mechanically generated from observed freqs
              
    adjusted - derived from the generic grammar, by
               mechanical adjustment of rules based 
               on observed word freqs.
               
  Word frequencies generated from those grammars are in
  prob/gen/${grclass}/${sec}/${part}.grx

MANUALLY CONSTRUCTED GENERIC GRAMMAR

  Comparing, adjusting, and re-comparing the generic grammar:
  
    unset sec
    unset grclass
    
    mkdir prob prob/{gen,cmp}
    mkdir prob/{gen,cmp}/generic
  
    /bin/rm -rf prob/gen/generic/txt.n
    mkdir prob/{gen,cmp}/generic/txt.n

    set parts = ( sword )
    
  Manual updating of counts:
  
    set sec = txt.n
    set part = sword
    set grfile = "gram/generic/${sec}/${part}.grx"
    echo "updating counts of ${grfile}..."
    cat ${grfile} \
      | filter-grammar \
      > ${grfile}+
    if ( ( ! ${status} ) && ( ! -z ${grfile}+ ) ) then 
      mv ${grfile} ${grfile}- && mv ${grfile}+ ${grfile}
    endif

  Updating and plotting all grammars:

    foreach part ( ${parts} )
      echo generic/txt.n/${part}
      process-grammar -truncate 0.00001 generic txt.n ${part}
    end
    
COMPARING GRAMMAR COUNTS IN VARIOUS SECTIONS

  Creating directories:

    unset grclass
    set part = "word"

    set secs = ( pha.2 cos.2 str.2 bio.1 hea.1 heb.1 )
    mkdir gram/adjusted prob/{gen,cmp}/adjusted
    mkdir gram/cmp-fr gram/cmp-ct
    foreach sec ( txt.n lab.n ${secs} )
      mkdir gram/adjusted/${sec}
      mkdir prob/gen/adjusted/${sec}
      mkdir prob/cmp/adjusted/${sec}
      mkdir gram/cmp-fr/${sec}
      mkdir gram/cmp-ct/${sec}
    end

  Obtaining rule counts for the generic grammar in various
  sections:
  
    foreach sec ( txt.n ${secs} lab.n )
      set oldgram = "gram/generic/txt.n/${part}.grx"
      set obsprob = "prob/obs/${sec}/${part}.frq"
      set newgram = "gram/adjusted/${sec}/${part}.grx"
      echo "${oldgram} + ${obsprob} -> ${newgram}"
      cat ${oldgram} \
        | parse-and-tally \
            -v maxderivs=1 -v countprec=0 -v ignorecounts=1 \
            -v wordcounts=${obsprob} \
        > ${newgram}
    end

  Computing and comparing predicted frequencies:

    foreach sec ( txt.n ${secs} lab.n )
      check-grammar -truncate 0.00001 adjusted ${sec} ${part}
    end

  Comparing grammar counts for several sections:
  
    set xsecs = ( txt.n ${secs} lab.n )
    set xlst = `echo "${xsecs}" | tr ' ' ','`
    set files = ( `echo gram/adjusted/{${xlst}}/${part}.grx` )
    echo "${files}"
    compare-grammars \
        -v freqs=1 -v prec=4 \
        -v titles="${xsecs}" \
        ${files} \
      > gram/cmp-fr/word.grx
    compare-grammars \
        -v freqs=0 \
        -v titles="${xsecs}" \
        ${files} \
      > gram/cmp-ct/word.grx
      
=== STOPPED HERE ============================================================

FORMATTING THE GRAMMAR

  Formatting the grammar for all words (minus labels):
  
    set htmldir = "/home/staff/stolfi/public_html/voynich/00-06-07-word-grammar"
    cat gram/adjusted/txt.n/word.grx \
      | html-format-grammar \
          -v title="A Grammar for Voynichese Words" \
      > ${htmldir}/txt.n.html
  
  Formatting the comparative grammars:
  
    set htmldir = "/home/staff/stolfi/public_html/voynich/00-06-07-word-grammar"
    cat gram/cmp-fr/word.grx \
      | html-format-grammar \
          -v title="Rule Frequencies for Various Sections" \
      > ${htmldir}/cmp-fr.html
    cat gram/cmp-ct/word.grx \
      | html-format-grammar \
          -v title="Rule Counts for Various Sections" \
      > ${htmldir}/cmp-ct.html

THE MYSTERY OF ISOLATED Es

  The following commands extract and count isolated "e"s,
  their immediate contexts, and the containing words with
  coremantles set off with "()":
  
    foreach m ( 1 2 3 4 5 6 7 8 )
      cat prob/obs/txt.n/word.frq \
        | gawk -v m=${m} \
            ' /./{ \
                 z = gensub(/([CKGche](|.*[CKGche]))/, "(\\1)", "g", $3); \
                 w = ("#" $3 "#"); \
                 gsub(/[cs]h/,"C",w); gsub(/[ktpf]/,"K",w); gsub(/cKh/,"G",w); \
                 while (match(w, /[^e][e]+[^e]/)) \
                   { s = substr(w,RSTART,RLENGTH); \
                     w = substr(w,RSTART+RLENGTH-1); \
                     if (length(s) == m+2) { print $1, (s ":" z); } \
                   } \
               } \
             ' \
        | combine-counts | compute-freqs \
        | tr ':' ' ' \
        | sort -b +2 -3 +0 -1nr \
        > .ee-${m}
    end
    
    cat .ee-[1-9] \
      | gawk '/./{print $1,substr($3,2,length($3)-2);}' \
      | combine-counts | compute-freqs \
      | sort -b +0 -1nr \
      > .ee-n

       8994 0.68059 e
       3895 0.29474 ee
        321 0.02429 eee
          5 0.00038 eeee
            
  Note the large drop from "ee" to "eee" (> 10-fold) and
  again from "eee" to "eeee" (> 50-fold), and the total absence of 5 or 
  more "e"s in a row.  The puny 5 "eeee" exceptions are
  almost certainly due to accidents, such as loss of ligature
  in "ch" or "sh":
  
      ykeeeedaiir
      keeees
      deeeese
      oeeees
      qoeeeey
  
  One explanation for the lack of "eeee" is that "ee" (and possibly
  "eee") are letters, but "eeee" would be a doubled letter, which is
  illegal in Voynichese. To confirm this theory, let's count the
  occurrences of "chch" and "shsh":
  
    cat prob/obs/txt.n/word.frq \
      | gawk \
          ' /./{ \
               w = ("#" $3 "#"); \
               gsub(/ch/,"C",w); gsub(/sh/,"S",w); \
               while (match(w, /[^CS][CS]+[^CS]/)) \
                 { s = substr(w,RSTART+1,RLENGTH-2); \
                   w = substr(w,RSTART+RLENGTH-1); \
                   print $1, s; \
                 } \
             } \
           ' \
      | combine-counts | compute-freqs \
      | sort -b +0 -1nr \
      > .chch

       9762 0.70739 C
       4002 0.29000 S
         13 0.00094 SC
         11 0.00080 CS
         10 0.00072 CC
          2 0.00014 SS
  
  So indeed double letters, or even the pairs "chsh" and "shch",
  are essentially nonexistent. 
  
  The theory is that isolated "e"s in the pre-mantle are
  pre-modifiers for the following gallows.  Let's check it out:

  The following commands extract and count the words that contain an
  isolated "e", and sort them by the right and left contexts of that "e":
  
    cat prob/obs/txt.n/word.frq \
      | sed -e 's/sh/ch/g' -e 's/[ktpf]/k/g' \
      | gawk \
          ' /./{ \
               w = ("#" $3 "#"); \
               gsub(/[cs]h/,"C",w); gsub(/[ktpf]/,"K",w); gsub(/cKh/,"G",w); \
               while (match(w, /[^e][e][^e]/)) \
                 { s = substr(w,RSTART,RLENGTH); \
                   w = substr(w,RSTART+RLENGTH-1); \
                   print $1, (s ":" $3); \
                 } \
             } ' \
      | combine-counts | compute-cum-freqs \
      | tr ':' ' ' \
      | sort -b +4 -5 +0 -1nr \
      > .single-e
       
  Then let's compute the frequencies of the immediate pre-context,
  post-context, and symmetric context of the isolated "e":
  
    foreach ij ( 1.1 3.1 1.3 )
      cat .single-e \
        | gawk -v i=${ij:r} -v j=${ij:e} \
            '/./{ print $1,substr($5,i,j); }' \
        | combine-counts | compute-freqs \
        | sort -b +0 -1nr \
        > .single-e-ctx-${ij}
    end

    multicol -v titles='left right both' .single-e-ctx-{1.1,3.1,1.3}

               left               right              both               
      -----------------  -----------------  -------------------
         5763 0.64076 C     3323 0.36947 d     2170 0.24127 Ced
         2483 0.27607 K     2340 0.26017 o     1405 0.15622 Ceo
          461 0.05126 G     1951 0.21692 y     1244 0.13831 Cey
          129 0.01434 o      519 0.05771 K     1030 0.11452 Ked
           38 0.00423 #      321 0.03569 a      800 0.08895 Keo
           33 0.00367 q      199 0.02213 G      415 0.04614 Key
           32 0.00356 d      146 0.01623 C      386 0.04292 CeK
           32 0.00356 s      121 0.01345 s      259 0.02880 Gey
            9 0.00100 l       55 0.00612 #      211 0.02346 Cea
            4 0.00044 a       10 0.00111 r      194 0.02157 CeG
            4 0.00044 r        4 0.00044 g      125 0.01390 KeC
            3 0.00033 y        3 0.00033 l       96 0.01067 Geo
            2 0.00022 c        1 0.00011 i       94 0.01045 Ged
            1 0.00011 h        1 0.00011 m       90 0.01001 Kea
                                                 88 0.00978 Ces
                                                 72 0.00801 oeK
                                                            ...
                                                  1 0.00011 sea
                                                  1 0.00011 ser
                                                  1 0.00011 yey
                                                  
    
  Let's redo this analysis after deleting the round letters
  (but preserving the identity of "e"-clusters):
  
    cat prob/obs/txt.n/word.frq \
      | sed -e 's/sh/ch/g' -e 's/[ktpf]/k/g' \
      | gawk \
          ' /./{ \
               w = ("#" $3 "#"); \
               gsub(/eeee/,"4",w); gsub(/eee/,"3",w); gsub(/ee/,"2",w); \
               gsub(/[cs]h/,"C",w); gsub(/[ktpf]/,"K",w); gsub(/cKh/,"G",w); \
               gsub(/[aoy]/,"",w); \
               while (match(w, /.[e]./)) \
                 { s = substr(w,RSTART,RLENGTH); \
                   w = substr(w,RSTART+RLENGTH-2); \
                   print $1, (s ":" $3); \
                 } \
             } ' \
      | combine-counts | compute-cum-freqs \
      | tr ':' ' ' \
      | sort -b +4 -5 +0 -1nr \
      > .single-e-noo
       
    foreach ij ( 1.1 3.1 1.3 )
      cat .single-e-noo \
        | gawk -v i=${ij:r} -v j=${ij:e} \
            '/./{ print $1,substr($5,i,j); }' \
        | combine-counts | compute-freqs \
        | sort -b +0 -1nr \
        > .single-e-noo-ctx-${ij}
    end

    multicol -v titles='left right both' .single-e-noo-ctx-{1.1,3.1,1.3}
 
               left               right              both               
      -----------------  -----------------  -------------------
         5786 0.64332 C     3916 0.43540 d     2467 0.27429 Ced
         2493 0.27718 K     2268 0.25217 #     1472 0.16366 Ce#
          462 0.05137 G      875 0.09729 l     1288 0.14321 Ked
           78 0.00867 #      676 0.07516 K      528 0.05871 CeK
           67 0.00745 q      472 0.05248 r      513 0.05704 Cel
           40 0.00445 s      258 0.02869 s      479 0.05326 Ke#
           35 0.00389 d      238 0.02646 G      320 0.03558 Kel
           12 0.00133 e      158 0.01757 C      314 0.03491 Cer
           11 0.00122 l       57 0.00634 i      270 0.03002 Ge#
            5 0.00056 r       52 0.00578 m      229 0.02546 CeG
            2 0.00022 2       12 0.00133 e      162 0.01801 Ces
            2 0.00022 c        4 0.00044 g      133 0.01479 KeC
            1 0.00011 h        3 0.00033 2      129 0.01434 Ged
                               3 0.00033 n      127 0.01412 Ker
                               1 0.00011 3       71 0.00789 Kes
                               1 0.00011 q       48 0.00534 #eK
                                                 43 0.00478 qeK
                                                 33 0.00367 Cei
                                                            ...
                                                  1 0.00011 reK
                                                  1 0.00011 sem

  We can see that the round letters do not play a significant
  role in the identification of single "e" letters, since 
  they rarely (18 occs only) occur between two "e" strings.
  
  Let's also look at the symmetric contexts after collapsing 
  all non-gallows, non-bench, non-single-e letters to "@":
  
    foreach f ( e e-noo )
      cat .single-${f}-ctx-1.3 \
        | gawk '/./{gsub(/[^KGCe]/, "@", $3); print $1,$3; }' \
        | combine-counts | compute-freqs \
        | sort -b +0 -1nr \
        > .single-${f}-ctx-smash
    end
    
    multicol -v titles='normal no-rounds' .single-e{,-noo}-ctx-smash

            normal               no-rounds          
      -------------------  -------------------
         5167 0.57449 Ce@     5001 0.55604 Ce@
         2357 0.26206 Ke@     2323 0.25828 Ke@
          457 0.05081 Ge@      528 0.05871 CeK
          386 0.04292 CeK      455 0.05059 Ge@
          194 0.02157 CeG      229 0.02546 CeG
          149 0.01657 @e@      133 0.01479 KeC
          131 0.01457 @eK      127 0.01412 @e@
          125 0.01390 KeC      106 0.01179 @eK
          ---------------      ---------------
           16 0.00178 CeC       31 0.00345 KeK
            5 0.00056 @eC       20 0.00222 CeC
            2 0.00022 @eG        8 0.00089 Cee
            2 0.00022 GeG        7 0.00078 eeK
            2 0.00022 GeK        4 0.00044 @eC
            1 0.00011 KeG        4 0.00044 GeK
                                 4 0.00044 ee@
                                 3 0.00033 @eG
                                 3 0.00033 GeG
                                 3 0.00033 KeG
                                 3 0.00033 Kee
                                 1 0.00011 @ee
                                 1 0.00011 eeC

  Note the clear split between "valid" and "non-valid" 
  patterns, highlighted with "-----", which is largely
  independent of the round letters!
  
  The following facts can be observed from this output:
  
    * Isolated "e" occurs almost exclusively (~99%) adjacent to 
      a gallows or bench.
      
    * Isolated "e" rarely occurs between two benches (20 occs only),
      between a non-coremantle and a bench (5 occs), 
      and NEVER between a platform gallows and a bench.
      
  Thus it seems that isolated-"e" occurs mainly at transitions
  between crust, mantle, and core.
      
PARTIAL GRAMMAR CHECKS

  The following commands compare partial sufcrust distributions with the 
  corresponding subgrammar of the suffix grammar.
  
  Comparing sufcrust distribution after "e"-sufmantle:
  
    cat prob/obs/${sec}/suffix.frq \
      | gawk \
         ' /./{ w=(" " $3); \
             if(match(w, /[^e][e][^he]*$/)) \
               {w=substr(w,RSTART+2); print $1,(w==""?"=":w);} \
           } \
         ' \
      | combine-counts | compute-freqs \
      > prob/obs/${sec}/.se.frq
    cat gram/${grclass}/${sec}/suffix.grx \
      | enum-language -v axiom=SE \
      > prob/gen/${grclass}/${sec}/.se.prb
    compare-probs ${grclass} ${sec} .se .se
  
  Comparing sufcrust distribution after "ch/sh"-sufmantle:
  
    cat prob/obs/${sec}/suffix.frq \
      | gawk \
         ' /./{ w=$3; \
             if(match(w, /[h][^he]*$/)) \
               {w=substr(w,RSTART+1); print $1,(w==""?"=":w);} \
           } \
         ' \
      | combine-counts | compute-freqs \
      > prob/obs/${sec}/.sx.frq
    cat gram/${grclass}/${sec}/suffix.grx \
      | enum-language -v axiom=SX \
      > prob/gen/${grclass}/${sec}/.sx.prb
    compare-probs ${grclass} ${sec} .sx .sx
  
  Comparing sufcrust distribution after empty-sufmantle:
  
    cat prob/obs/${sec}/suffix.frq \
      | gawk \
         ' /./{ w=$3; \
             if(match(w, /^[^he]*$/)) \
               { print $1, (w==""?"=":w); } \
           } \
         ' \
      | combine-counts | compute-freqs \
      > prob/obs/${sec}/.sz.frq
    cat gram/${grclass}/${sec}/suffix.grx \
      | enum-language -v axiom=S \
      > prob/gen/${grclass}/${sec}/.sz.prb
    compare-probs ${grclass} ${sec} .sz .sz
  
  Comparing global sufcrust distribution:
  
    cp -p prob/obs/${sec}/{sufcrust,.s}.frq
    cat gram/${grclass}/${sec}/suffix.grx \
      | enum-language -v axiom=Sdb \
      > prob/gen/${grclass}/${sec}/.s.prb
    compare-probs ${grclass} ${sec} .s .s
  
  Comparing global sufmantle distribution:

    cp -p prob/obs/${sec}/{sufmantle,.n}.frq
    cat gram/${grclass}/${sec}/suffix.grx \
      | enum-language -v axiom=Ndb \
      > prob/gen/${grclass}/${sec}/.n.prb
    compare-probs ${grclass} ${sec} .n .n
    
  Comparing premantle part of prefix:
  
    cp -p prob/obs/${sec}/{premantle,.m}.frq
    cat gram/${grclass}/${sec}/prefix.grx \
      | enum-language -v axiom=Mdb \
      > prob/gen/${grclass}/${sec}/.m.prb
    compare-probs ${grclass} ${sec} .m .m
    
  Obtaining distribution of (a) coreless words, and
  (b) words with core, minus the core:
  
    cat prob/obs/${sec}/word.frq \
      | gawk  ' ($3 \!~ /[ktpf]/){ print $1, $3; } ' \
      | sort -b +0 -1nr \
      | compute-freqs \
      > prob/obs/${sec}/.wcm.frq
  
    cat prob/obs/${sec}/word.frq \
      | gawk  \
          ' ($3 ~ /[ktpf]/){ \
              gsub(/[q]*[aoy]*[ci]*[ktpf][h]*[e]*/, "", $3); \
              if ($3 == "") { $3 = "="; } \
              print $1, $3; \
            } \
          ' \
      | combine-counts \
      | sort -b +0 -1nr \
      | compute-freqs \
      > prob/obs/${sec}/.wcx.frq

  Comparing distribution of words with non-empty core
  with grammar word-1:
  
    cat prob/obs/${sec}/word.frq \
      | gawk  '($3 ~ /[ktpf]/){ print $1, $3; }' \
      | sort -b +0 -1nr \
      | compute-freqs \
      > prob/obs/${sec}/.w1c.frq
    
    cat gram/${grclass}/${sec}/word-1.grx \
      | enum-language -v axiom=WCp \
      > prob/gen/${grclass}/${sec}/.w1c.prb
    compare-probs ${grclass} ${sec} .w1c .w1c
  
    cp -p prob/obs/${sec}/{word,.m}.frq
    cat gram/${grclass}/${sec}/word-1.grx \
      | enum-language -v axiom=W \
      > prob/gen/${grclass}/${sec}/.m.prb
    compare-probs ${grclass} ${sec} .m .m