Hacking at the Voynich manuscript - Side notes
107 Computing and comparing word and token length distributions

Last edited on 2002-01-17 16:58:55 by stolfi

INTRODUCTION

  This note computes the token and word length distributions
  (in basic and OKOKOKO elements) and generates
  the plots that will go into the tech report.
  
!!! TO DO
!!! Rename the factoring scripts: 
!!!    trivial -> viqr/pinyin/acip/etc.
!!!    viqr/pinyin -> phon
!!! Split joined words in Vietnamese sample
!!! Combine the pinyin disambiguating suffix with tone in 1 digit
!!! Define a "rational pinyin" encoding with yi->i, wu->u
!!! 


SETTING UP THE ENVIRONMENT

  Links:
  
    ln -s ../../capitalize-ligatures
    ln -s ../../compute-cum-cum-freqs
    ln -s ../../compute-cum-freqs
    ln -s ../../compute-freqs
    ln -s ../../combine-counts
    ln -s ../../remove-freqs
    ln -s ../../totalize-fields
    ln -s ../../select-units
    ln -s ../../words-from-evt
    ln -s ../../format-counts-packed
    ln -s ../../update-paper-include
    ln -s ../../factor-field-general
    
    ln -s ../100/sample
    
    ln -s ../../factor-text-trivial.gawk
    ln -s ../../factor-text-viqr.gawk
    ln -s ../../factor-text-pinyin.gawk
    ln -s ../103/factor-text-basic.gawk
    ln -s ../202/factor-text-oko.gawk

COMPUTING WORD AND TOKEN LENGTH DISTRIBUTIONS
  
  Paper directory:

    set tbldir = "/home/staff/stolfi/papers/voynich-stats/techrep/tables/auto"
    set figdir = "/home/staff/stolfi/papers/voynich-stats/techrep/figures/auto"

  Computing the Voynichese token and word length histograms in terms of basic
  and OKOKOKO glyphs. We also generate comparison plots of good and raw
  token lists, in order to ensure that the good-word selection did not
  introduce any significant bias in the length statistics.

    make -f vms-length-hists.make all

COMPARING TOKEN LENGTH DISTRIBUTIONS WITH OTHER LANGS
    
  Selecting the non-Voynichese samples:
  
    set sampelems = ( \
      engl/wow.trivial \
      engl/cul.trivial \
      latn/ptt.trivial \
      grek/nwt.trivial \
      span/qvi.trivial \
      geez/gok.trivial \
      viet/ptt.trivial \
      viet/ptt.viqr \
      tibe/vim.trivial \
      tibe/ccv.trivial \
      chin/ptt.pinyin \
      chin/ptt.trivial \
      chin/red.pinyin \
      \
      enrc/wow.trivial \
      chrc/red.trivial \
      \
      engn/wow.trivial \
      engp/cul.trivial \
    )
    
  Computing the token and word length distributions of the selected
  samples, with one "make" run per sample/encoding pair:

    foreach item ( ${sampelems} )
      set smp = ${item:r}
      set enc = ${item:e}
      make -f other-length-hists.make \
        LANG=${smp:h} BOOK=${smp:t} ELEM=${enc} \
        single-sampelem
    end

  Summarizing the average token (t) and word (w) lengths. Each value
  is read from the corresponding "-avlen.tex" file, keeping only the 
  text between the last "{" and the "}" that follows it:
  
    foreach item ( {voyp,voyl}/vms.{basic,oko} ${sampelems} )
      set smp = ${item:r}
      set enc = ${item:e}
      printf "%s %-8s " ${smp} ${enc}
      foreach kind ( t w )
        set texfile = "sample/${smp}/tot.t/gud-fact-${enc}-${kind}-avlen.tex"
        set mean = "`sed -e 's:^.*{::' -e 's:}.*::' ${texfile}`"
        printf " %7s" "${mean}"
      end
      printf "\n"
    end

      voyp/vms basic        4.45    5.54
      voyp/vms oko          3.75    4.76
      voyl/vms basic        5.08    5.44
      voyl/vms oko          4.61    4.93
      engl/wow trivial      4.43    6.92
      engl/cul trivial      4.28    6.54
      latn/ptt trivial      5.44    7.74
      grek/nwt trivial      4.85    7.65
      span/qvi trivial      4.34    7.22
      geez/gok trivial      6.80    7.79
      viet/ptt trivial      4.34    4.69
      viet/ptt viqr         2.74    2.92
      tibe/vim trivial      3.53    4.28
      tibe/ccv trivial      3.29    4.14
      chin/ptt pinyin       3.52    3.74
      chin/ptt trivial      4.29    5.19
      chin/red pinyin       3.65    3.77
      enrc/wow trivial      4.83    5.82
      chrc/red trivial      5.21    5.59
      engn/wow trivial      7.09    7.26
      engp/cul trivial      6.26    6.64

  Plotting the length distributions: first Voynichese text against
  labels (plot 1), then Voynichese against the other language
  samples (plots 2-7):

    foreach fmt ( eps png )
      foreach tw ( t w )
        set ofile = "langs-${tw}-lengths-1.${fmt}"; echo "${ofile}"
        compare-elem-count-distribs -size 1.50,0.75 -freqs -format ${fmt} \
            sample/voyn/vms/tot.t/gud-fact-basic-${tw}.lhi   "Voynichese (both)" 1.00 1 1 \
            sample/voyp/vms/tot.t/gud-fact-basic-${tw}.lhi   "Voynichese (text)" 1.00 2 2 \
            sample/voyl/vms/tot.t/gud-fact-basic-${tw}.lhi   "Voynichese (labs)" 1.00 3 3 \
          > ${ofile}
        update-paper-include ${ofile} ${figdir}
      end
    end

    foreach fmt ( eps png )
      foreach tw ( t w )
        set ofile = "langs-${tw}-lengths-2.${fmt}"; echo "${ofile}"
        compare-elem-count-distribs -size 1.50,0.75 -freqs -format ${fmt} \
            sample/voyp/vms/tot.t/gud-fact-basic-${tw}.lhi   "Voynichese (text)" 1.00 1 1 \
            sample/engl/wow/tot.t/gud-fact-trivial-${tw}.lhi "English WotW"      1.00 2 2 \
            sample/engl/cul/tot.t/gud-fact-trivial-${tw}.lhi "English CHerb"     1.00 3 3 \
          > ${ofile}
        update-paper-include ${ofile} ${figdir}
      end
    end

    foreach fmt ( eps png )
      foreach tw ( t w )
        set ofile = "langs-${tw}-lengths-3.${fmt}"; echo "${ofile}"
        compare-elem-count-distribs -size 1.50,0.75 -freqs -format ${fmt} \
            sample/voyp/vms/tot.t/gud-fact-basic-${tw}.lhi   "Voynichese (text)" 1.00 1 1 \
            sample/latn/ptt/tot.t/gud-fact-trivial-${tw}.lhi "Latin OTst"        1.00 2 2 \
            sample/grek/nwt/tot.t/gud-fact-trivial-${tw}.lhi "Greek NTst"        1.00 3 3 \
            sample/span/qvi/tot.t/gud-fact-trivial-${tw}.lhi "Spanish DQuijote"  1.00 4 4 \
            sample/geez/gok/tot.t/gud-fact-trivial-${tw}.lhi "Ethiopian GotK"    1.00 5 5 \
          > ${ofile}
        update-paper-include ${ofile} ${figdir}
      end
    end

    foreach fmt ( eps png )
      foreach tw ( t w )
        set ofile = "langs-${tw}-lengths-4.${fmt}"; echo "${ofile}"
        compare-elem-count-distribs -size 1.50,0.75 -freqs -format ${fmt} \
            sample/voyp/vms/tot.t/gud-fact-basic-${tw}.lhi   "Voynichese (text)"    0.50 1 1 \
            sample/chin/ptt/tot.t/gud-fact-pinyin-${tw}.lhi  "Chinese OTst (phon)"  0.50 2 2 \
            sample/chin/ptt/tot.t/gud-fact-trivial-${tw}.lhi "Chinese OTst (piny)"  0.50 3 3 \
            sample/chin/red/tot.t/gud-fact-pinyin-${tw}.lhi  "Chinese DoRM (phon)"  0.50 4 4 \
          > ${ofile}
        update-paper-include ${ofile} ${figdir}
      end
    end

    foreach fmt ( eps png )
      foreach tw ( t w )
        set ofile = "langs-${tw}-lengths-5.${fmt}"; echo "${ofile}"
        compare-elem-count-distribs -size 1.50,0.75 -freqs -format ${fmt} \
            sample/voyp/vms/tot.t/gud-fact-basic-${tw}.lhi   "Voynichese (text)"  0.67 1 1 \
            sample/tibe/vim/tot.t/gud-fact-trivial-${tw}.lhi "Tibetan VimSut"     0.67 2 2 \
            sample/tibe/ccv/tot.t/gud-fact-trivial-${tw}.lhi "Tibetan CmCmVR"     0.67 3 3 \
          > ${ofile}
        update-paper-include ${ofile} ${figdir}
      end
    end

    foreach fmt ( eps png )
      foreach tw ( t w )
        set ofile = "langs-${tw}-lengths-6.${fmt}"; echo "${ofile}"
        compare-elem-count-distribs -size 1.50,0.75 -freqs -format ${fmt} \
            sample/voyn/vms/tot.t/gud-fact-basic-${tw}.lhi   "Voynichese (basic)"  0.50 1 1 \
            sample/viet/ptt/tot.t/gud-fact-viqr-${tw}.lhi    "Vietn. OTst (phon)"  0.50 2 2 \
            sample/viet/ptt/tot.t/gud-fact-trivial-${tw}.lhi "Vietn. OTst (viqr)"  0.50 3 3 \
          > ${ofile}
        update-paper-include ${ofile} ${figdir}
      end
    end

    foreach fmt ( eps png )
      foreach tw ( t w )
        set ofile = "langs-${tw}-lengths-7.${fmt}"; echo "${ofile}"
        compare-elem-count-distribs -size 1.50,0.75 -freqs -format ${fmt} \
            sample/voyp/vms/tot.t/gud-fact-basic-${tw}.lhi   "Voynichese (text)"  0.80 1 1 \
            sample/enrc/wow/tot.t/gud-fact-trivial-${tw}.lhi "Engl. WotW RomCd"   0.80 2 2 \
            sample/chrc/red/tot.t/gud-fact-trivial-${tw}.lhi "Chin. DoRM RomCd"   0.80 3 3 \
          > ${ofile}
        update-paper-include ${ofile} ${figdir}
      end
    end
    
COMPARISON WITH BINOMIAL DISTRIBUTIONS

    set binplots = ( \
      voyn/vms:basic:9:1 \
      voyn/vms:oko:6.0:1.8 \
      viet/ptt:trivial:6.0:1.55 \
      chin/ptt:trivial:7.5:1.45 \
      tibe/ccv:trivial:4.5:1.9 \
      viet/ptt:viqr:0.9:2.4 \
      chin/ptt:pinyin:1.5:3.0 \
    )
    
    @ iplt = 0
    foreach plt ( ${binplots} )
      @ iplt = ${iplt} + 1
      set p = ( `echo ${plt} | tr ':' ' '` )
      set sample = "${p[1]}"; set elem = "${p[2]}"; 
      set ntot = "${p[3]}"; set nshf = "${p[4]}";
      set ymax = `gawk -v ntot=${ntot} 'BEGIN{printf "%.2f",0.9/sqrt(ntot);}'`
      foreach fmt ( eps png )
        foreach tw ( w )
          set ofile = "${sample}/tot.t/binom-${tw}-${elem}-lengths.${fmt}"; echo "${ofile}"
          compare-elem-count-distribs \
              -size 1.125,0.75 -freqs -ymax ${ymax} -xmax 15 -format ${fmt} \
              -binom ${ntot} ${nshf} \
              sample/${sample}/tot.t/gud-fact-${elem}-${tw}.lhi  "${sample} (${elem})" 1.00 1 1 \
            > sample/${ofile}
          update-paper-include sample/${ofile} ${figdir}/${ofile}
        end
      end
    end 


  Voynichese text and label word lengths (basic elements) together
  in a single plot, with the binomial distribution for reference:
  
    foreach fmt ( eps png )
      foreach tw ( w )
        set ofile = "binom-${tw}-lengths.${fmt}"; echo "${ofile}"
        compare-elem-count-distribs -size 1.50,0.75 -freqs -format ${fmt} \
            -binom 9 1 \
            sample/voyp/vms/tot.t/gud-fact-basic-${tw}.lhi  "Voynichese (text)" 1.00 2 2 \
            sample/voyl/vms/tot.t/gud-fact-basic-${tw}.lhi  "Voynichese (labs)" 1.00 3 3 \
          > ${ofile}
        update-paper-include ${ofile} ${figdir}
      end
    end
    
  Comparing Voynichese word and token lengths in OKO elements with
  the Vietnamese, English and Latin samples:

    foreach fmt ( eps png )
      foreach tw ( t w )
        set ofile = "langs-${tw}-lengths-1-oko.${fmt}"; echo "${ofile}"
        compare-elem-count-distribs -size 1.50,0.75 -freqs -format ${fmt} \
            sample/voyn/vms/tot.t/gud-fact-oko-${tw}.lhi      "Voynich (oko)"   0.80 1 1 \
            sample/viet/ptt/tot.t/gud-fact-trivial-${tw}.lhi  "Viet. OTst. (viqr)" 0.80 2 3 \
            sample/engl/wow/tot.t/gud-fact-trivial-${tw}.lhi  "English WotW"       0.80 3 4 \
            sample/latn/ptt/tot.t/gud-fact-trivial-${tw}.lhi  "Latin OTst."        0.80 4 5 \
          > ${ofile}
        update-paper-include ${ofile} ${figdir}/
      end
    end
    
>>> STOPPED HERE <<<
      
  Extracting sets of Voynichese words of same length (lengths 01-03
  and 09-11), for each sample and each element kind. Field 3 is
  factored into field 4, only fields 1 and 4 are passed on to the
  length filter, and the result is sorted on its second field.
  (The sort key uses the POSIX "-k2,2" form; the obsolete "+1 -2"
  form is rejected by current versions of "sort".)

    foreach sample ( text labs )
      foreach ekind ( basic oko )
        foreach len ( 01 02 03  09 10 11 )
          set ofile = "lang/${sample}/voyn-${ekind}-${len}.cts"; echo "${ofile}"
          cat lang/${sample}/voyn.wfr \
            | capitalize-ligatures -v field=3 \
            | factor-field-general \
                -f factor-text-${ekind}.gawk -v inField=3 -v outField=4 \
            | gawk '/./{ print $1, $4; }' \
            | extract-words-by-elem-count -v len=${len} \
            | sort -b -k2,2 \
            > ${ofile}
          wc ${ofile}
        end
      end
    end
    
COMPUTING THE LENGTH DISTRIBUTION OF ROMAN NUMERALS

  Computing the length distributions. For each line of the input
  file, the length of the first field minus 1 is tabulated; each
  output line has the number of occurrences followed by that length,
  sorted by increasing length. (The sort key uses the POSIX "-k2,2n"
  form; the obsolete "+1 -2n" form is rejected by current versions
  of "sort".)

    foreach ekind ( old new )
      set ifile = ".roman-${ekind}.nums"
      set ofile = ".roman-${ekind}-length.cts"
      echo "${ifile} -> ${ofile}"
      gawk '//{ print length($1)-1; }' ${ifile} \
        | sort | uniq -c | expand \
        | sort -b -k2,2n \
        > ${ofile}
    end


  
