Hacking at the Voynich manuscript - Side notes
107 Computing and comparing word and token length distributions

Last edited on 2012-05-06 02:09:32 by stolfilocal

INTRODUCTION

  This note computes the token and word length distributions
  (in basic and OKOKOKO elements) and generates
  the plots that will go into the tech report.

SETTING UP THE ENVIRONMENT

  Links:
  
    # Directories under ../tr-stats, plus the "work" directory three levels up;
    # each is linked into the current directory under its own base name.
    for target in ../tr-stats/dat ../tr-stats/exp ../tr-stats/fig ../../../work ; do
      ln -s "${target}"
    done

    # Entries of work/ that this note uses, linked under their own names.
    for tool in \
      capitalize-ligatures \
      compute-cum-cum-freqs \
      compute-cum-freqs \
      compute-elem-count-distrib \
      compute-freqs \
      combine-counts \
      remove-freqs \
      totalize-fields \
      select-units \
      words-from-evt \
      format-counts-packed \
      update-paper-include \
      factor-field-general \
    ; do
      ln -s "work/${tool}"
    done

    # The factor-text-*.gawk programs from work/.
    for conv in \
      trivial \
      viqr-to-phon \
      pinyin-to-phon \
      pinyin-std \
      pinyin-fix \
      eva-to-basic \
      eva-to-oko \
    ; do
      ln -s "work/factor-text-${conv}.gawk"
    done

COMPUTING WORD AND TOKEN LENGTH DISTRIBUTIONS
  
  Selecting the samples and length-defining encodings:
  
    # Each entry is LANG/BOOK.ELEM: LANG/BOOK names a sample directory under
    # dat/, and ELEM is the element encoding in which lengths are measured.
    # The loops below split entries at the last "." and at the "/".
    # The two envt/wow entries are listed because their encoding links are
    # created below and they have rows in the recorded average-length table.
    sampelems=( \
      voyn/maj.bgly \
      voyn/prs.bgly \
      voyn/lab.bgly \
      \
      voyn/maj.qoko \
      voyn/prs.qoko \
      voyn/lab.qoko \
      \
      engl/wow.lets \
      engl/cul.lets \
      engl/twp.lets \
      latn/ptt.lets \
      latn/nwt.lets \
      latn/ock.lets \
      grek/nwt.lets \
      span/qvi.lets \
      fran/tal.lets \
      ital/psp.lets \
      port/csm.lets \
      germ/sim.lets \
      russ/pic.lets \
      russ/ptt.lets \
      arab/quf.jsar \
      arab/quv.jsar \
      arab/qud.jsar \
      arab/qph.jsar \
      arab/qcs.jsar \
      hebr/tav.jshb \
      hebr/tad.jshb \
      geez/gok.sera \
      \
      viet/ptt.viqr \
      viet/ptt.phon \
      \
      viet/nwt.viqr \
      viet/nwt.phon \
      \
      chin/ptt.stpy \
      chin/ptt.fxpy \
      chin/ptt.phon \
      \
      chin/ptn.stpy \
      chin/ptn.fxpy \
      chin/ptn.phon \
      \
      chin/red.stpy \
      chin/red.fxpy \
      chin/red.phon \
      \
      chin/voa.stpy \
      chin/voa.fxpy \
      chin/voa.phon \
      \
      chip/voa.stpy \
      chip/voa.fxpy \
      chip/voa.phon \
      \
      tibe/vim.acip \
      tibe/ccv.acip \
      tibe/pmi.acip \
      \
      enrc/wow.lets \
      envg/wow.lets \
      \
      envt/wow.viqr \
      envt/wow.phon \
      \
      chrc/red.lets \
      \
      voyp/grs.bgly \
      voyp/grm.bgly \
      \
      viep/grs.viqr \
      viep/grs.phon \
      \
      viep/mky.viqr \
      viep/mky.phon \
      \
      engl/wnm.lets \
      engl/cpn.lets \
    )
    
  Defining the encoding functions:
  
    # Prefix of the link targets; the path is relative to each dat/LANG/BOOK
    # directory, which is where the links get created.
    wft="../../../work/factor-text"
    # Prefix of the link names.
    ft="factor-text"

    # One "LANG/BOOK:CONVERSION:ELEM" item per link. For each item, the link
    # factor-text-to-ELEM.gawk is created inside dat/LANG/BOOK and points to
    # the factor-text-CONVERSION.gawk program.
    for spec in \
      voyn/maj:eva-to-basic:bgly \
      voyn/prs:eva-to-basic:bgly \
      voyn/lab:eva-to-basic:bgly \
      \
      voyn/maj:eva-to-oko:qoko \
      voyn/prs:eva-to-oko:qoko \
      voyn/lab:eva-to-oko:qoko \
      \
      engl/wow:trivial:lets \
      engl/cul:trivial:lets \
      engl/twp:trivial:lets \
      latn/ptt:trivial:lets \
      latn/nwt:trivial:lets \
      latn/ock:trivial:lets \
      grek/nwt:trivial:lets \
      span/qvi:trivial:lets \
      fran/tal:trivial:lets \
      ital/psp:trivial:lets \
      port/csm:trivial:lets \
      germ/sim:trivial:lets \
      russ/pic:trivial:lets \
      russ/ptt:trivial:lets \
      arab/quf:trivial:jsar \
      arab/quv:trivial:jsar \
      arab/qud:trivial:jsar \
      arab/qph:trivial:jsar \
      arab/qcs:trivial:jsar \
      \
      hebr/tav:trivial:jshb \
      hebr/tad:trivial:jshb \
      \
      geez/gok:trivial:sera \
      \
      viet/ptt:trivial:viqr \
      viet/ptt:viqr-to-phon:phon \
      viet/nwt:trivial:viqr \
      viet/nwt:viqr-to-phon:phon \
      \
      chin/ptt:pinyin-std:stpy \
      chin/ptt:pinyin-fix:fxpy \
      chin/ptt:pinyin-to-phon:phon \
      \
      chin/ptn:pinyin-std:stpy \
      chin/ptn:pinyin-fix:fxpy \
      chin/ptn:pinyin-to-phon:phon \
      \
      chin/red:pinyin-std:stpy \
      chin/red:pinyin-fix:fxpy \
      chin/red:pinyin-to-phon:phon \
      \
      chin/voa:pinyin-std:stpy \
      chin/voa:pinyin-fix:fxpy \
      chin/voa:pinyin-to-phon:phon \
      \
      chip/voa:pinyin-std:stpy \
      chip/voa:pinyin-fix:fxpy \
      chip/voa:pinyin-to-phon:phon \
      \
      tibe/vim:trivial:acip \
      tibe/ccv:trivial:acip \
      tibe/pmi:trivial:acip \
      \
      chrc/red:trivial:lets \
      enrc/wow:trivial:lets \
      envg/wow:trivial:lets \
      \
      envt/wow:trivial:viqr \
      envt/wow:viqr-to-phon:phon \
      \
      voyp/grs:eva-to-basic:bgly \
      voyp/grm:eva-to-basic:bgly \
      \
      viep/grs:trivial:viqr \
      viep/grs:viqr-to-phon:phon \
      \
      viep/mky:trivial:viqr \
      viep/mky:viqr-to-phon:phon \
      \
      engl/wnm:trivial:lets \
      engl/cpn:trivial:lets \
    ; do
      dir=${spec%%:*}                    # LANG/BOOK
      elem=${spec##*:}                   # ELEM
      conv=${spec#*:}; conv=${conv%:*}   # CONVERSION (the middle field)
      # Subshell keeps the "cd" from leaking into the following iterations.
      ( cd "dat/${dir}" && ln -s "${wft}-${conv}.gawk" "${ft}-to-${elem}.gawk" )
    done

  Checking if the links are OK:
  
    # For every sample/encoding pair, list the file behind its encoding link.
    # "ls -L" follows the link, so a link with a missing target should show
    # up as an "ls" error rather than as a listing line.
    for se in "${sampelems[@]}" ; do
      elem="${se##*.}"      # text after the last "."
      sample="${se%.*}"     # text before the last "."
      link="dat/${sample}/factor-text-to-${elem}.gawk"
      ls -lL "${link}"
    done

  Computing the Voynichese token and word length histograms in terms of basic
  and OKOKOKO glyphs. We also generate comparison plots of good and raw
  token lists, in order to ensure that the good-word selection did not
  introduce any significant bias in the length statistics.

    # Builds the "all" target of vms-length-hists.make (makefile not shown in this note).
    make -f vms-length-hists.make all

COMPARING TOKEN LENGTH DISTRIBUTIONS WITH OTHER LANGS
  
  Computing the token and word length distributions of 
  other languages of interest: 

    # Build the "all" target of tw-length-hists.make once per sample/encoding
    # pair, passing the pieces of the LANG/BOOK.ELEM entry as make variables.
    # NOTE(review): LANG is also the locale environment variable, and GNU make
    # passes command-line variables on to recipe commands; confirm that the
    # tools run by the makefile tolerate this.
    for se in "${sampelems[@]}" ; do
      elem=${se##*.}
      sample=${se%.*}
      lang=${sample%/*}
      book=${sample##*/}
      make -f tw-length-hists.make \
        LANG="${lang}" BOOK="${book}" QUAL=gud ELEM="${elem}" \
        all
    done

  Summarizing average token and word lengths (columns: sample,
  element encoding, token average, word average):
  
    # Print one row per sample/encoding pair: "LANG/BOOK ELEM" followed by
    # the values read from the "t" and "w" (presumably token and word)
    # "-avlen.tex" files under dat/LANG/BOOK/tot.1/.
    for se in "${sampelems[@]}" ; do
      sample=${se%.*}; elem=${se##*.}
      printf '%s %-8s ' "${sample}" "${elem}"
      for tkwd in t w ; do
        afile="dat/${sample}/tot.1/gud-fact-${elem}-${tkwd}-avlen.tex"
        # On each line, keep only the text between the last "{" and the
        # first "}" that follows it.
        avg="$(sed -e 's:^.*{::' -e 's:}.*::' "${afile}")"
        printf ' %7s' "${avg}"
      done
      printf '\n'
    done

      voyn/maj bgly         4.47    5.58
      voyn/prs bgly         4.45    5.54
      voyn/lab bgly         5.08    5.44
      voyn/maj qoko         3.78    4.81
      voyn/prs qoko         3.75    4.76
      voyn/lab qoko         4.61    4.93
      engl/wow lets         4.43    6.92
      engl/cul lets         4.28    6.53
      engl/twp lets         3.65    5.41
      latn/ptt lets         5.44    7.73
      latn/nwt lets         5.14    7.68
      latn/ock lets         5.94    8.24
      grek/nwt lets         4.85    7.65
      span/qvi lets         4.33    7.42
      fran/tal lets         4.74    7.67
      ital/psp lets         4.65    7.59
      port/csm lets         4.46    7.38
      germ/sim lets         5.10    7.80
      russ/pic lets         5.34    8.03
      russ/ptt lets         4.64    7.03
      arab/quf jsar         7.72    9.39
      arab/quv jsar         7.15    8.70
      arab/qud jsar         4.47    5.50
      arab/qph jsar         6.53    8.20
      arab/qcs jsar         4.51    5.62
      hebr/tav jshb         8.55   10.13
      hebr/tad jshb         5.44    6.57
      geez/gok sera         6.82    7.79
      viet/ptt viqr         4.36    4.68
      viet/ptt phon         2.73    2.91
      viet/nwt viqr         4.42    4.70
      viet/nwt phon         2.73    2.92
      chin/ptt stpy         3.62    3.98
      chin/ptt fxpy         3.58    3.95
      chin/ptt phon         3.53    3.75
      chin/ptn stpy         3.59    3.96
      chin/ptn fxpy         3.55    3.93
      chin/ptn phon         3.51    3.73
      chin/red stpy         3.73    4.03
      chin/red fxpy         3.68    3.99
      chin/red phon         3.63    3.77
      chin/voa stpy         3.88    3.99
      chin/voa fxpy         3.84    3.95
      chin/voa phon         3.68    3.74
      chip/voa stpy         3.72    3.90
      chip/voa fxpy         3.68    3.88
      chip/voa phon         3.68    3.82
      tibe/vim acip         3.53    4.30
      tibe/ccv acip         3.29    4.13
      tibe/pmi acip         3.82    4.34
      enrc/wow lets         5.26    7.45
      envg/wow lets         4.43    6.18
      envt/wow viqr         4.61    4.71
      envt/wow phon         2.85    2.93
      chrc/red lets         4.95    5.59
      voyp/grs bgly         4.89    5.40
      voyp/grm bgly         4.40    5.03
      viep/grs viqr         4.68    5.18
      viep/grs phon         2.47    2.49
      viep/mky viqr         4.40    5.12
      viep/mky phon         2.75    3.05
      engl/wnm lets         7.16    7.22
      engl/cpn lets         6.11    6.46

  Plotting the distributions of texts and labels

    # NOTE(review): invoked by bare name, so the script must be on PATH; confirm, or run it as ./generate-all-lang-plots.sh
    generate-all-lang-plots.sh 

COMPARISON WITH BINOMIAL DISTRIBUTIONS

    # NOTE(review): invoked by bare name, so the script must be on PATH; confirm, or run it as ./generate-all-binom-plots.sh
    generate-all-binom-plots.sh