Hacking at the Voynich manuscript - Side notes
102 Tabulating the most popular words and labels

Last edited on 2002-01-17 02:38:56 by stolfi

INTRODUCTION

  Here we tabulate the most popular words and labels, 
  globally and by section.

SETTING UP THE ENVIRONMENT

  Links:

    ln -s ../../capitalize-ligatures
    ln -s ../../compute-cum-freqs
    ln -s ../../combine-counts
    ln -s ../../update-paper-include

    ln -s ../100/sample

  Paper directories:

    set tbldir = "/home/staff/stolfi/papers/voynich-stats/techrep/tables/auto"
    set figdir = "/home/staff/stolfi/papers/voynich-stats/techrep/figures/auto"

TABULATING THE MOST POPULAR WORDS

  Make sure that every sample directory has an executable
  "reencode-words-for-tex" filter; where it is missing, link it
  to the trivial filter:
  
    set trivial = "/home/staff/stolfi/voynich/work/reencode-words-trivial"
    set fname = "reencode-words-for-tex"
    foreach smpdir ( sample/????/??? )
      if ( ! -x ${smpdir}/${fname} ) then
        echo "linking ${smpdir}/${fname} -> ${trivial:t}"
        ln -s ${trivial} ${smpdir}/${fname}
      endif
    end

  Samples to be tabulated for the TeX report. Each entry has the
  form "{sample}.{N}", where N is the number of most popular words
  to list for that sample:
  
    set sampsizes = ( \
      voyp/vms.80 \
      \
      engl/wow.60 \
      engl/cul.60 \
      latn/ptt.60 \
      grek/nwt.60 \
      geez/gok.60 \
      span/qvi.60 \
      viet/ptt.60 \
      chin/ptt.60 \
      chin/red.60 \
      tibe/vim.60 \
      tibe/ccv.60 \
      chrc/red.60 \
      enrc/wow.60 \
      \
      voyl/vms.40 \
      engp/cul.40 \
      engn/wow.40 \
      \
      voyi/vms.40 \
      voyf/vms.40 \
      voym/vms.40 \
    )
  
  Formatted "top N" word lists for the TeX report, with N taken
  from each entry of "sampsizes":
  
    foreach spec ( ${sampsizes} )
      set sample = "${spec:r}"
      set nwords = "${spec:e}"
      set tfile = "${sample}/tot.t/top-${nwords}-words.tex"
      set outfile = "sample/${tfile}"
      echo "${tfile}"
      get-top-words ${nwords} ${sample}/tot.t \
        | gawk '/./{ print $1, $2, $5; }' \
        | sample/${sample}/reencode-words-for-tex -v field=3 \
        | tex-format-word-freqs -v ncols=4 -v showCounts=1 -v showFreqs=1 \
        > ${outfile}
      cat ${outfile}
      update-paper-include ${outfile} ${tbldir}/${tfile}
    end
    
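  Formatted "bottom 40" word lists for the TeX report. These use the
  same samples as "sampsizes", but the per-sample count is ignored
  and 40 words are listed for every sample: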
  
    foreach spec ( ${sampsizes} )
      set sample = "${spec:r}"
      set nwords = 40
      set tfile = "${sample}/tot.t/bot-${nwords}-words.tex"
      set outfile = "sample/${tfile}"
      echo "${tfile}"
      get-bot-words ${nwords} ${sample}/tot.t \
        | gawk '/./{ print $1, $2, $5; }' \
        | sample/${sample}/reencode-words-for-tex -v field=3 \
        | tex-format-word-freqs -v ncols=8 -v showCounts=0 -v showFreqs=0 \
        > ${outfile}
      cat ${outfile}
      update-paper-include ${outfile} ${tbldir}/${tfile}
    end
    
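  Formatted per-section "top N" word lists (N = 25 for voyp/vms,
  N = 15 for the other samples). Each entry has the form
  "{sample}/{sec1},{sec2},...,{secK}.{N}":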
  
    set sampsecs = ( \
      voyp/vms/pha,hea,heb,cos,str,zod,bio.25 \
      latn/ptt/gen,exo,lev,num,deu.15 \
      grek/nwt/mat,mrk,luk,joh.15 \
      engl/cul/pre,her,rec.15 \
    )
    
    foreach spec ( ${sampsecs} )
      set sample = "${spec:h}"
      set secspec = "${spec:t}"
      set nwords = "${secspec:e}"
      set mainsecs = ( `echo ${secspec:r} | tr ',' ' '` )
      set tfile = "${sample}/top-${nwords}-words-per-section.tex"
      set outfile = "sample/${tfile}"
      echo "${tfile}"
      get-top-words-per-section ${nwords} ${sample} ${mainsecs} \
        | gawk '/./{ print $1, $2, $3, $6; }' \
        | sample/${sample}/reencode-words-for-tex -v field=4 \
        | tex-format-word-freqs-by-section -v showCounts=0 -v showFreqs=1 \
        > ${outfile}
      cat ${outfile}
      update-paper-include ${outfile} ${tbldir}/${tfile}
    end

#END
