Hacking at the Voynich manuscript - Side notes
102 Tabulating the most popular lexemes in prose text and labels

Last edited on 2025-05-04 16:42:32 by stolfi

INTRODUCTION

  Here we tabulate the most popular lexemes in the prose text and
  labels, globally and by section.

SETTING UP THE ENVIRONMENT

  Links:

    ln -s ../tr-stats/dat
    ln -s ../tr-stats/tex
    ln -s ../../../voynich/work
    ln -s work/capitalize-ligatures
    ln -s work/compute-cum-freqs
    ln -s work/update-paper-include

  This note uses the "bash" shell.

TABULATING THE MOST POPULAR LEXEMES

  Make sure that every sample has at least a trivial tex-encoding function:

    export ftri="reencode-words-trivial"
    export ftex="reencode-words-for-tex"
    for smpdir in dat/????/??? ; do
      filter="${smpdir}/${ftex}"
      if [[ ! -x ${filter} ]]; then
        echo "linking ${filter} -> work/${ftri}" 1>&2
        ( cd $smpdir && ln -s work/${ftri} ${ftex} )
      fi
    done

  Formatted lists of most popular lexemes, for TeX report:

    sampsizes=( \
        voyn/prs:80 \
        voyn/prs:60 \
        voyn/prs:40 \
        voyn/prs:24 \
        voyn/prs:16 \
        \
        voyn/lab:40 \
        voyn/lab:24 \
        voyn/lab:16 \
        \
        engl/wow:60 \
        engl/cul:60 \
        engl/twp:60 \
        latn/ptt:60 \
        latn/nwt:60 \
        latn/ock:60 \
        grek/nwt:60 \
        span/qvi:60 \
        ital/psp:60 \
        fran/tal:60 \
        port/csm:60 \
        germ/sim:60 \
        russ/pic:60 \
        russ/ptt:60 \
        arab/quf:60 \
        arab/quv:60 \
        arab/qud:60 \
        arab/qph:60 \
        arab/qcs:60 \
        hebr/tav:60 \
        hebr/tad:60 \
        geez/gok:60 \
        viet/ptt:60 \
        viet/nwt:60 \
        chin/ptt:60 \
        chin/ptn:60 \
        chin/red:60 \
        chin/voa:60 \
        chip/voa:60 \
        tibe/vim:60 \
        tibe/ccv:60 \
        tibe/pmi:60 \
        chrc/red:60 \
        enrc/wow:60 \
        envt/wow:60 \
        envg/wow:60 \
        \
        voyp/grs:40 \
        voyp/grm:40 \
        viep/grs:40 \
        viep/mky:40 \
        \
        engl/cpn:40 \
        engl/wnm:40 \
        \
        voyn/ini:40 \
        voyn/mid:40 \
        voyn/fin:40 \
      )

  Formatted "N top" lexeme lists for TeX report:

    for sn in ${sampsizes[@]} ; do
      sna=( ${sn/:/ } )
      sample="${sna[0]}"; nlexemes="${sna[1]}"
      tfile="${sample}/tot.1/top-${nlexemes}-words.tex"
      echo "sample = ${sample} nlexemes = ${nlexemes} tex file = ${tfile}" 1>&2
      ./get_top_words_per_section.sh ${nlexemes} ${sample}/tot.1 \
        | gawk '/./{ print $1, $2, $5; }' \
        | dat/${sample}/reencode-words-for-tex -v field=3 \
        | ./tex_format_word_freqs_by_section.gawk \
            -v ncols=4 \
            -v showCounts=1 -v showFreqs=1 \
        > dat/${tfile}
      cat dat/${tfile}
      update_paper_include.sh dat/${tfile} tex/${tfile}
    done

  Formatted "24 bottom" and "40 bottom" lexeme lists for TeX report:

    for nlexemes in 24 40 ; do
      for sn in ${sampsizes[@]} ; do
        sna=( ${sn/:/ } )
        sample="${sna[0]}"
        tfile="${sample}/tot.1/bot-${nlexemes}-words.tex"
        echo "sample = ${sample} nlexemes = ${nlexemes} tex file = ${tfile}" 1>&2
        ./get_bot_words.sh ${nlexemes} ${sample}/tot.1 \
          | gawk '/./{ print $1, $2, $5; }' \
          | dat/${sample}/reencode-words-for-tex -v field=3 \
          | ./tex_format_word_freqs_by_section.gawk \
              -v ncols=8 \
              -v showCounts=0 -v showFreqs=0 \
          > dat/${tfile}
        cat dat/${tfile}
        update_paper_include.sh dat/${tfile} tex/${tfile}
      done
    done

  Formatted per-section "top 24" or "top 16" lexeme lists:

    make_tex_top_words_by_section.sh

#END
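
  A minimal sketch of how each {sampsizes} entry is taken apart by the
  "${sn/:/ }" expansion in the loops above: the ":" is replaced by a
  blank, and the unquoted array assignment then word-splits the result
  into the sample tag and the lexeme count.  The entry "voyn/prs:80" is
  copied from the list above, and the variable names are the ones used
  in the loops:

    sn="voyn/prs:80"
    sna=( ${sn/:/ } )      # "voyn/prs 80", split into two elements
    sample="${sna[0]}"     # --> "voyn/prs"
    nlexemes="${sna[1]}"   # --> "80"
    echo "sample = ${sample}  nlexemes = ${nlexemes}"

  The gawk filter '/./{ print $1, $2, $5; }' in the same pipelines
  drops blank lines and keeps only columns 1, 2, and 5 of whatever
  get_top_words_per_section.sh or get_bot_words.sh produce.  The input
  line below is only an illustration of that column selection; the
  actual output format of those scripts is not shown in this note:

    echo "380 0.0214 1 1 daiin" \
      | gawk '/./{ print $1, $2, $5; }'
    # prints "380 0.0214 daiin"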