Hacking at the Voynich manuscript - Side notes
040 Verbal descriptions of VMS pages 

Last edited on 2004-11-27 18:56:05 by stolfi

  The purpose of this Note is to compile and format the 
  page-by-page descriptions of the VMS, both text and
  illustrations.
  
  This page covers the preparation of the 16e6 release only.
  For the 20e1 release, see note 071.

COLLECTING THE PAGE LISTS

  Gathering page lists:

    ln -s ../../L16+H-eva
    
    ( cd L16+H-eva && gawk -v FS=':' '/./{print $2}' INDEX ) \
      > .all.units
    cat .all.units | egrep -v '[.]' > .all.pages
      
    set units = ( `cat .all.units` )
    set pages = ( `cat .all.pages` )
    
  Checking whether the INDEX is consistent with the directory
  contents:

    cat .all.units | sort > .foo
    ( cd L16+H-eva && ls f[0-9]* ) | grep -v '~' | sort > .bar
    diff .foo .bar
    
EXTRACTING THE VERBAL DESCRIPTIONS

  The verbal descriptions were originally included in the page header
  files of the EVA interlinear, as "Description" and "Comments"
  sections. I thought it best to move them to a separate package,
  leaving only the more formal "Identification" and "Attributes" sections.
  
  Extracting all "Description" and "Comments" sections to 
  separate files:
  
    mkdir desc+comm

    foreach f ( $pages )
      /usr/bin/printf "%-6s " $f
      cat L16+H-eva/$f \
        | sed -e '/^# *[Dd]escription[:]* *$/,$\!d' \
        > desc+comm/$f
      cat desc+comm/$f | wc -lc 
    end
    
  Detaching the "Identification" and "Attributes" sections:
  
    mkdir attr

    foreach f ( $pages )
      /usr/bin/printf "%-6s " $f
      cat L16+H-eva/$f \
        | sed -e '/^# *[Dd]escription[:]* *$/,$d' \
        > attr/$f
      cat attr/$f | wc -lc 
    end
    
  Checking whether we lost anything:
  
    foreach f ( $pages )
      set t = "`cat L16+H-eva/$f | wc -l`"
      set d = "`cat desc+comm/$f | wc -l`"
      set a = "`cat attr/$f | wc -l`"
      @ e = $t - ( $a + $d )
      /usr/bin/printf "%-6s %4d (%d - %d - %d)\n" $f $e $t $a $d
    end

  Moving the "Identification/Attributes" sections to the interlinear
  directory:
  
    ( cd L16+H-eva && tar cvf - ${pages} | gzip > 1999-01-01-page-descs.tgz )
    
    foreach f ( $pages )
      mv -v attr/$f L16+H-eva
    end

  Splitting Description sections from Comment sections:
  
    mkdir comm
    
    foreach f ( $pages )
      /usr/bin/printf "%-6s " $f
      cat desc+comm/$f \
        | sed -e '/^# *[Cc]omment[s]*[:]* *$/,$\!d' \
        > comm/$f
      cat comm/$f | wc -l 
    end
    
    mkdir desc
    
    foreach f ( $pages )
      /usr/bin/printf "%-6s " $f
      cat desc+comm/$f \
        | sed -e '/^# *[Cc]omment[s]*[:]* *$/,$d' \
        > desc/$f
      cat desc/$f | wc -l 
    end
    
  Checking whether we lost anything:
  
    foreach f ( $pages )
      set t = "`cat desc+comm/$f | wc -l`"
      set d = "`cat desc/$f | wc -l`"
      set c = "`cat comm/$f | wc -l`"
      @ e = $t - ( $c + $d )
      /usr/bin/printf "%-6s %4d (%d - %d - %d)\n" $f $e $t $d $c
    end
  
  Doing the same with the "Identification" and "Attributes" sections:
  
    mkdir iden
    
    foreach f ( $pages )
      /usr/bin/printf "%-6s " $f
      cat L16+H-eva/$f \
        | sed \
            -e '1,/^# *[Ii]dentification*[:]* *$/d' \
            -e '/^# *[Aa]ttributes*[:]* *$/,$d' \
        > iden/$f
      cat iden/$f | wc -l 
    end
    
    mkdir attr
    
    foreach f ( $pages )
      /usr/bin/printf "%-6s " $f
      cat L16+H-eva/$f \
        | sed \
            -e '1,/^# *[Aa]ttributes*[:]* *$/d' \
        > attr/$f
      cat attr/$f | wc -l 
    end
    
  Extracting the EVMT header lines:

    mkdir evhd 
    
    foreach f ( $pages )
      /usr/bin/printf "%-6s " $f
      cat L16+H-eva/$f \
        | egrep '^##' \
        > evhd/$f
      cat evhd/$f | wc -l 
    end
    
  Making sure we didn't lose anything:
  
    cat `echo $pages | tr ' ' '\012' | sed -e 's:^:{evhd,iden,attr}/:g'` \
      | egrep -v '^# *$' \
      > .foo

    (cd L16+H-eva && cat ${pages} ) \
      | egrep -v '^# Last edited' \
      | egrep -v '^# Identification[:]* *$' \
      | egrep -v '^# Attributes[:]* *$' \
      | egrep -v '^# *$' \
      > .bar

    dicio-wc .bar .foo
    
      lines   words     bytes file        
    ------- ------- --------- ------------
       3375   17035    100389 .bar
       3375   17035    100389 .foo

    diff .bar .foo > .diff
    
    cat .bar | egrep '^##' | dicio-wc

      lines   words     bytes file        
    ------- ------- --------- ------------
        257    1644     11092 

    cat .bar | egrep 'Quire:' | dicio-wc

      lines   words     bytes file        
    ------- ------- --------- ------------
        257    1799      9463 

  Moving the EVMT header files back to the interlinear directory:
  
    ( cd L16+H-eva && tar cvf - ${pages} | gzip > 1999-01-29-headers.tgz ) 
    
    ( cd evhd && mv -v ${pages} ../../../L16+H-eva/ )

    rmdir evhd
  
    mkdir bibl

  Manually edited the comment files, moving the reference lists
  to the bibliography directory "bibl".

  Cleaning up the files:
  
    foreach dir ( iden attr desc comm bibl )
      echo '=== '${dir}' ==='
      ( cd ${dir} && filter-files ../cleanup-page-descr `ls ${pages}` )
    end

REJOINING THE PAGE FILES

  Editing 5 separate files per page was a pain, so I decided to 
  rejoin them into one file per page.
  
  First, checking whether we got everything for every page:

    cat .all.pages | sort > .foo
    foreach dir ( iden attr desc comm bibl ) 
      echo '=== '${dir}' ==='
      ( cd $dir && ls ) | grep -v '~' | sort > .pages.${dir}
      diff .pages.${dir} .foo
    end
    
  Now let's join them:
  
    mkdir data 
    
    foreach p ( ${pages} )
      echo "=== $p ==="
      if ( -r data/$p ) /bin/mv data/$p data/$p~
      echo "#" >>  data/$p
      echo "# Identification:" >> data/$p
      echo "#" >>  data/$p
      cat iden/$p | egrep -v '^# *Last edited' >> data/$p
      echo "# Attributes:" >> data/$p
      echo "#" >>  data/$p
      cat attr/$p | egrep -v '^# *Last edited' >> data/$p
      echo "# Description:" >> data/$p
      echo "#" >>  data/$p
      cat desc/$p | egrep -v '^# *Last edited' >> data/$p
      echo "# Comments:" >> data/$p
      echo "#" >>  data/$p
      cat comm/$p | egrep -v '^# *Last edited' >> data/$p
      echo "# References:" >> data/$p
      echo "#" >>  data/$p
      cat bibl/$p | egrep -v '^# *Last edited' >> data/$p
      echo "#" >>  data/$p
      echo '# Last edited on DATE TIME by USER' >> data/$p
      set oldl = "`cat {iden,attr,desc,comm,bibl}/${p} | wc -l | tr -d '\012'`"
      set newl = "`cat data/$p | wc -l | tr -d '\012'`"
      /usr/bin/printf "old = %s  new = %s\n" ${oldl} ${newl} 
    end
    
  Checking for lossage:
  
    /bin/rm -f .bar
    foreach dir ( iden attr desc comm bibl ) 
      echo '=== '${dir}' ==='
      ( cd $dir && cat `ls | grep -v '~' | sort` ) \
        | egrep -v '^# *Last edited'\
        | egrep -v '^# *$' \
        >> .bar
    end
    ( cd data && cat `ls | grep -v '~' | sort` ) \
      | egrep -v '^# *Last edited'\
      | egrep -v '^# *(Identification|Attributes|Description|Comments|References): *$'\
      | egrep -v '^# *$' \
      | egrep -v '^## <.*$' \
      > .baz
    dicio-wc .bar .baz

FORMATTING THE DESCRIPTIONS 

  Formatting all page descriptions as HTML files:

    set release = "16e6"
    
    # mkdir html

    set prev = 'NONE'
    set nxs = ( ${pages[2-]} 'NONE' )
    foreach pg ( ${pages} )
      echo '=== '${pg}' ==='
      set next = "${nxs[1]}"
      set nxs = ( ${nxs[2-]} )
      make-html-page ${pg} ${prev} ${next} < data/${pg} > html/${pg}.htm
      set prev = "${pg}"
    end
    
EXTRACTING SOME USEFUL TABLES

  Extracting tables that map the page f-number to:
  
    bifolio code  bQN, where Q is an uppercase letter denoting
                  a quire, and N is a digit that gives the
                  nesting depth.
                  
    Rene's page number - two uppercase letters XY where
                  X denotes the quire and Y the page in
                  that quire.
  
    foreach datum ( bifolio qpnum )
      set fbtbl = "fnum-to-${datum}.tbl"
      /bin/rm -f ${fbtbl}
      foreach f ( $pages )
        set bif = "`cat data/$f | extract-${datum}`"
        echo "${f}: ${bif}"
        if ( "/${bif}" == "/" ) then
          echo "${f}: ${datum} data not found"
        else
          echo "${f} ${bif}" >> ${fbtbl}
        endif
      end
    end

  Checking for completeness:

    foreach datum ( bifolio qpnum )
      echo '=== '"${datum}"
      cat ../../fnum-to-pnum.tbl \
        | gawk '/./{print $1;}' \
        | sort > .foo
      cat fnum-to-${datum}.tbl \
        | gawk '/./{print $1;}' \
        | sort \
        > .bar
      diff .foo .bar

    end
    
  Had to manually edit fnum-to-qpnum.tbl to account for the 
  merging of some pages in Rene's numbering into single 
  pages in the f-numbering.  
  
    mv -i fnum-to-{bifolio,qpnum}.tbl ../..

  Creating the reverse mapping:

    cat fnum-to-qpnum.tbl \
      | gawk '/./{print $2, $1;}' \
      | sort \
      > qpnum-to-fnum.tbl

  Edited it to implement the merging, which maps several qpnums to the
  same fnum.

    mv -i qpnum-to-fnum.tbl ../..