Hacking at the Voynich manuscript - Side notes
101 Preparing clean samples of various other languages 

Last edited on 2002-01-17 01:43:18 by stolfi

SUMMARY

  Here we prepare text samples in English, Latin, and other languages,
  comparable in size to the Voynichese reference sample, for the
  statistical analyses that will go into the "word structure"
  technical report.

!!!  TO DO:
!!!   Check spaces in Ethiopian sample.

SETTING UP THE ENVIRONMENT

  Links:

    # Symlinks to shared scripts and data, relative to this note's directory:
    # compute-freqs / update-paper-include are helper scripts; "sample" is
    # Note 100's sample tree; "Texts" is the source-text database.
    ln -s ../../compute-freqs
    ln -s ../../update-paper-include

    ln -s ../100/sample
    ln -s ../../Texts

  Paper directories:

    # Where generated TeX tables and figures get installed for the techrep:
    set tbldir = "/home/staff/stolfi/papers/voynich-stats/techrep/tables/auto"
    set figdir = "/home/staff/stolfi/papers/voynich-stats/techrep/figures/auto"

CHOOSING THE WORD SAMPLES FROM OTHER LANGUAGES

  Get number of tokens in Voynichese reference sample 
  (plain prose and labels):

    # Total token counts for the Voynichese reference samples: sum field 1
    # (the occurrence count) of each sample's gud.wfr word-frequency file,
    # leaving the totals in the csh variables $nvoyp, $nvoyl, $nvoyn.
    # (voyp = plain prose, voyl = labels, per the note above; voyn —
    # confirm which subset that tag denotes.)
    foreach lang ( voyp voyl voyn )
      cat sample/${lang}/vms/tot.t/gud.wfr \
        | gawk '/./{s+=$1} END{print s}' \
        > .tmp
      set n${lang} = `cat .tmp`
    end 
    echo $nvoyp $nvoyl $nvoyn
    
  Define the set of language samples, extracted from the "Texts" database:
  
     lang/buk   Language     Encoding   Size  Text source
     --------   -----------  ---------  ----  ----------------------------
     engl/wow   English      English    nTxt  War of the Worlds [WoW] 
     engl/cul   English      English    nTxt  Culpeper's Herbal [CH]
     latn/ptt   Latin        Latin      nTxt  Pentateuch (Vulgate) [Ptt-L]
     grek/nwt   Greek        JSGreek    nTxt  New Testament (Byzantine) [NwT-G]
     span/qvi   Spanish      Spanish    nTxt  Don Quijote [DQui]
     geez/gok   Ethiopian    SERA       nTxt  Glory of the Kings [GoK]
     viet/ptt   Vietnamese   VIQR       nTxt  Pentateuch (Cadman) [Ptt-V]
     chin/ptt   Chinese      Pinyin     nTxt  Pentateuch (Union) [Ptt-C]
     chin/red   Chinese      Pinyin     nTxt  Dream of Red Mansion [DoRM]
     tibe/vim   Tibetan      ACIP       nTxt  Vimalakirti Sutra [VS]
     tibe/ccv   Tibetan      ACIP       nTxt  Com. on Com. on Valid Reas. [CCVR]
                                            
     enrc/wow   English      RomanNum   nTxt  [WoW] text in "Roman code"
     chrc/red   Chinese      RomanNum   nTxt  [DoRM] text in "Roman code"
                                            
     engn/wow   EnglNames    English    nLab  Proper names from [WoW]
     engp/cul   EnglPlants   English    nLab  Plant names from [CH]

  where nTxt = ${nvoyp}, nLab = ${nvoyl}

    # Sample specs, one per entry, in the form "lang/buk.SIZE/sec1,sec2,...":
    # sample directory, target (raw) token count, and the comma-separated
    # list of source sections to draw from.  Prose-sized samples use
    # ${nvoyp}; label-sized samples (last two) use ${nvoyl}.
    # (Comments cannot be interleaved below: the list is one continued line.)
    set sampsecs = ( \
      engl/wow.${nvoyp}/bod.1 \
      engl/cul.${nvoyp}/pre.1,her.1,rec.1 \
      latn/ptt.${nvoyp}/gen.1,exo.1,num.1,lev.1,deu.1 \
      grek/nwt.${nvoyp}/mat.1,mrk.1,luk.1,joh.1 \
      span/qvi.${nvoyp}/bod.1 \
      geez/gok.${nvoyp}/bod.1 \
      viet/ptt.${nvoyp}/bod.1 \
      tibe/vim.${nvoyp}/bod.1 \
      tibe/ccv.${nvoyp}/bod.1 \
      chin/ptt.${nvoyp}/bod.1 \
      chin/red.${nvoyp}/bod.1 \
      \
      enrc/wow.${nvoyp}/bod.1 \
      chrc/red.${nvoyp}/bod.1 \
      \
      engn/wow.${nvoyl}/bod.1 \
      engp/cul.${nvoyl}/bod.1 \
    )
    
  List of sample dirnames without sections:
    
    # Strip the ".SIZE/sections" suffix from each spec to get the bare
    # sample dirnames (lang/buk); also keep a comma-separated copy for
    # later csh brace expansion.
    set samples = ( `echo ${sampsecs} | tr ' ' '\012' | sed -e 's:[.].*$::g'` )
    set samplescm = `echo ${samples} | tr ' ' ','`
    echo "${samplescm}"

  Create directories if needed, and ensure presence of the 
  source links, and word-mapping tables for fix-raw-words: 

    # For every sample, create its subdirectory under the data ("sample"),
    # table, and figure trees, then warn about any missing input files:
    #   word-map.tbl     - word-mapping table used by fix-raw-words
    #   source/main.evt  - the source text in EVMT format
    #   sample-fns.gawk  - per-sample gawk helper functions
    foreach sample ( ${samples} )
      foreach pd ( sample ${tbldir} ${figdir} )
        set dir = "${pd}/${sample}"
        if ( ! ( -d ${dir} ) ) mkdir -p ${dir}
      end
      set tfile = "sample/${sample}/word-map.tbl"
      if ( ! ( -r "${tfile}" ) ) echo "*** missing ${tfile}"
      set sfile = "sample/${sample}/source/main.evt"
      ls -ld ${sfile}
      if ( ! ( -r "${sfile}" ) ) echo "*** missing ${sfile}"
      set gfile = "sample/${sample}/sample-fns.gawk"
      if ( ! ( -r "${gfile}" ) ) echo "*** missing ${gfile}"
    end

  Create section lists for each sample, and the corresponding directories:

    # For each spec "lang/buk.SIZE/sec1,sec2,...", csh modifiers give:
    #   :h = "lang/buk.SIZE"   :t = "sec1,sec2,..."
    #   :r = "lang/buk"        :e = "SIZE"
    # Record the section list and target good-token count for each sample,
    # and create the per-section directories in all three trees.
    foreach sampsec ( ${sampsecs} )
      set smsz = ${sampsec:h}
      set secscm = "`echo ${sampsec:t}`"
      set sample = ${smsz:r}
      set size = ${smsz:e}
      set secs = ( `echo ${secscm} | tr ',' ' '` )
      echo "${sample} size = ${size} sections = ( ${secs} tot.t )"
      echo "${secs}" | tr ' ' '\012' > sample/${sample}/subsections.tags
      echo "${size}" > sample/${sample}/gud.num
      cp -p sample/${sample}/subsections.tags sample/${sample}/subsections-ok.tags
      foreach sec ( ${secs} tot.t )
        foreach pd ( sample ${tbldir} ${figdir} )
          set dir = "${pd}/${sample}/${sec}"
          if ( ! ( -d ${dir} ) ) mkdir ${dir}
        end
        # Default "whole" size: effectively unlimited (no truncation).
        # Write it under sample/ where the companion trunc-raw.num files
        # live; previously this used ${dir}, which after the inner loop
        # still pointed at ${figdir}/${sample}/${sec}.
        echo "999999" > sample/${sample}/${sec}/whole-raw.num
      end
    end

      engl/wow size = 35027 sections = ( bod.1 tot.t )
      engl/cul size = 35027 sections = ( pre.1 her.1 rec.1 tot.t )
      latn/ptt size = 35027 sections = ( gen.1 exo.1 num.1 lev.1 deu.1 tot.t )
      grek/nwt size = 35027 sections = ( mat.1 mrk.1 luk.1 joh.1 tot.t )
      span/qvi size = 35027 sections = ( bod.1 tot.t )
      geez/gok size = 35027 sections = ( bod.1 tot.t )
      viet/ptt size = 35027 sections = ( bod.1 tot.t )
      tibe/vim size = 35027 sections = ( bod.1 tot.t )
      tibe/ccv size = 35027 sections = ( bod.1 tot.t )
      chin/ptt size = 35027 sections = ( bod.1 tot.t )
      chin/red size = 35027 sections = ( bod.1 tot.t )
      enrc/wow size = 35027 sections = ( bod.1 tot.t )
      chrc/red size = 35027 sections = ( bod.1 tot.t )
      engn/wow size = 1003 sections = ( bod.1 tot.t )
      engp/cul size = 1003 sections = ( bod.1 tot.t )

OBTAINING THE RAW SAMPLE TEXTS

  Cleanup of derived files:

    # foreach sample ( ${samples} )
    #   set secs = ( `cat sample/${sample}/subsections-ok.tags` tot.t )
    #   foreach sec ( ${secs} )
    #     /bin/rm sample/${sample}/${sec}/{trunc-raw.num,raw.evt,*.{tks,wfr}}
    #   end
    # end

  The number of raw tokens taken for each sample is adjusted so that
  the number of good tokens matches (as far as possible) the number of
  good tokens in the "voyp" or "voyl" samples, as appropriate.
  
    # Hand-tuned raw-token truncation counts, one per section, chosen so
    # that each sample's total of "gud" tokens comes out as close as
    # possible to nTxt = ${nvoyp} (prose samples) or nLab = ${nvoyl}
    # (label samples); compare the "gud" counts table further below.
    echo 35040 > sample/engl/wow/bod.1/trunc-raw.num
    
    echo  3200 > sample/engl/cul/pre.1/trunc-raw.num 
    echo 23054 > sample/engl/cul/her.1/trunc-raw.num
    echo  8985 > sample/engl/cul/rec.1/trunc-raw.num
    
    echo  9118 > sample/latn/ptt/gen.1/trunc-raw.num
    echo  7254 > sample/latn/ptt/exo.1/trunc-raw.num
    echo  4981 > sample/latn/ptt/lev.1/trunc-raw.num
    echo  6984 > sample/latn/ptt/num.1/trunc-raw.num
    echo  6690 > sample/latn/ptt/deu.1/trunc-raw.num
    
    echo  9921 > sample/grek/nwt/mat.1/trunc-raw.num
    echo  6156 > sample/grek/nwt/mrk.1/trunc-raw.num
    echo 10525 > sample/grek/nwt/luk.1/trunc-raw.num
    echo  8425 > sample/grek/nwt/joh.1/trunc-raw.num
    
    echo 35041 > sample/span/qvi/bod.1/trunc-raw.num

    echo 35307 > sample/geez/gok/bod.1/trunc-raw.num

    echo 35027 > sample/viet/ptt/bod.1/trunc-raw.num

    echo 35048 > sample/tibe/vim/bod.1/trunc-raw.num

    echo 35041 > sample/tibe/ccv/bod.1/trunc-raw.num

    echo 35027 > sample/chin/ptt/bod.1/trunc-raw.num

    echo 35027 > sample/chin/red/bod.1/trunc-raw.num

    echo 35027 > sample/chrc/red/bod.1/trunc-raw.num

    echo 35040 > sample/enrc/wow/bod.1/trunc-raw.num

    echo  1003 > sample/engn/wow/bod.1/trunc-raw.num

    echo  1003 > sample/engp/cul/bod.1/trunc-raw.num

  Obtain sample texts, suitable for analysis, in EVMT format.
  Extract the right number of raw tokens from the EVMT text,
  then extract the gud and bad tokens, and count all kinds
  of words in all sections:

    # Extract each sample at both sizes ("whole" and "trunc"), build its
    # TeX summary table, and run dicio-wc over the source and per-section
    # raw texts.
    foreach sample ( ${samples} )
      # Rebuild the comma-separated section list for THIS sample from its
      # subsections-ok.tags file.  (The ${secscm} left over from the setup
      # loop above holds only the LAST spec's sections, so reusing it gave
      # the wrong list for every multi-section sample.)
      set secs = ( `cat sample/${sample}/subsections-ok.tags` )
      set secscm = `echo ${secs} | tr ' ' ','`
      foreach sizetag ( whole trunc )
        get-sample-files ${sample} ${sizetag}
        set ffile = "sample/${sample}/tot.t/gud.wfr"
        set tfile = "${sample}/${sizetag}-summary.tex"
        # NOTE(review): ${exfile} is set but not used below — confirm
        # whether a later step in the notebook relies on it.
        set exfile = ${sample:h}-${sample:t}-${sizetag}-summary.tex
        if ( -r ${ffile} ) then
          tex-make-sample-summary ${sample} ${sizetag} > sample/${tfile}
          cat sample/${tfile}
          update-paper-include sample/${tfile} ${tbldir}/${tfile}
        else
           echo "*** no ${ffile} for ${sizetag}, ${tfile} not created"
        endif
      end
      dicio-wc sample/${sample}/{source/main,{${secscm}}/raw}.evt
    end

  Tabulate counts of tokens and words:

    # Tabulate per-sample token and word totals for each text class:
    # raw = as extracted, gud = accepted words, bad = rejected words.
    # Each .wfr file has one word per line with its count in field 1;
    # the per-kind table accumulates in .summary-${kind}.
    foreach kind ( raw gud bad )
      /bin/rm -f .summary-${kind}
      echo ' '
      echo "  Counts for ${kind} text"
      echo "  sample    tokens   words"
      echo "  -------- ------- -------"
      foreach sample ( ${samples} )
        set ifile = "sample/${sample}/tot.t/${kind}.wfr";
        cat ${ifile} \
          | gawk -v sample=${sample} \
              ' /./{ w++; t+=$1;} \
                END{ printf "  %s %7d %7d\n",sample,t,w;} \
              ' \
          >> .summary-${kind}
      end
      cat .summary-${kind}
    end

      Counts for raw text
      sample    tokens   words
      -------- ------- -------
      engl/wow   35040    4874
      engl/cul   35239    3816
      latn/ptt   35027    6634
      grek/nwt   35027    5437
      span/qvi   35041    5453
      geez/gok   35126   12336
      viet/ptt   35027    1706
      tibe/vim   35048    1315
      tibe/ccv   35041     854
      chin/ptt   35027    1376
      chin/red   35027    2420
      enrc/wow   35040    4874
      chrc/red   35027    2420
      engn/wow    1003     234
      engp/cul     983     485

      Counts for gud text
      sample    tokens   words
      -------- ------- -------
      engl/wow   35027    4869
      engl/cul   35027    3811
      latn/ptt   35027    6634
      grek/nwt   35027    5437
      span/qvi   35027    5452
      geez/gok   34849   12324
      viet/ptt   35027    1706
      tibe/vim   35027    1312
      tibe/ccv   35027     845
      chin/ptt   35027    1376
      chin/red   35027    2420
      enrc/wow   35027    4869
      chrc/red   35027    2420
      engn/wow    1003     234
      engp/cul     979     484

      Counts for bad text
      sample    tokens   words
      -------- ------- -------
      engl/wow      13       5
      engl/cul     212       5
      latn/ptt       0       0
      grek/nwt       0       0
      span/qvi      14       1
      geez/gok     277      12
      viet/ptt       0       0
      tibe/vim      21       3
      tibe/ccv      14       9
      chin/ptt       0       0
      chin/red       0       0
      enrc/wow      13       5
      chrc/red       0       0
      engn/wow       0       0
      engp/cul       4       1


# NOT DONE #############################################################
# 
# ROMAN NUMERALS
# 
#   Generating the Roman numerals 0-999, additive system:
#   [Now replaced by roman-coded texts such as enrc/wow and 
#   chrc/red]
#   
#     /bin/rm .roman-old.nums
#     foreach u ( '' I II III IIII V VI VII VIII VIIII )
#       foreach d ( '' X XX XXX XXXX L LX LXX LXXX LXXXX )
#         foreach c ( '' C CC CCC CCCC D DC DCC DCCC DCCCC )
#           echo "#$c$d$u" >> .roman-old.nums
#         end
#       end
#     end
# 
#   Generating the Roman numerals 0-999, subtractive system:
# 
#     /bin/rm .roman-new.nums
#     foreach u ( '' I II III IV V VI VII VIII IX )
#       foreach d ( '' X XX XXX XL L LX LXX LXXX XC )
#         foreach c ( '' C CC CCC CD D DC DCC DCCC CM )
#           echo "#$c$d$u" >> .roman-new.nums
#         end
#       end
#     end
# 
# 
# JUNK
#     
#   Generating the TeX-formatted summary:
#   [Now superseded by per-language summaries]
#   
#     foreach kind ( raw gud bad )
#       set ifile = ".summary-${kind}"
#       set tfile = "otherlangs-${kind}-tw-summary.tex"
#       printf "%% Created by Note-101.txt\n" > ${tfile}
#       printf "%%\n" >> ${tfile}
#       cat ${ifile} \
#         | gawk \
#             ' /./ { \
#                 smp = $1; tks = $2; wds = $3; \
#                 gsub(/[\/]/, "", smp); \
#                 printf "\\def\\%sGudTks{%d}\n",smp,tks; \
#                 printf "\\def\\%sGudWds{%d}\n",smp,wds; \
#               } \
#             ' \
#         >> ${tfile}
#       update-paper-include ${tfile} ${tbldir}/
#     end
# 
#   Take a small sample from each language, and pretend it is labels:
#   [Now replaced by more realistic samples such as engn/wow and engp/cul]
# 
#     set ratio = \
#       `gawk -v nprose=${nprose} -v nlabs=${nlabs} 'BEGIN{print nlabs/(nprose - 2*nlabs);}'`
#     echo ${ratio}
#     
#     foreach lp ( eng.0 lat.0 )
#       set lng = "${lp:r}"; set plus = "${lp:e}"
#       @ ntake = ${nlabs} + ${plus}
#       set ifile = "sample/${lng}/prose/raw.tks"
#       set ofile = "sample/${lng}/labs/raw.tks"
#       echo "${ifile} -> ${ofile}"
#       cat ${ifile} \
#         | gawk -v ratio=${ratio} '(rand() <= ratio){ print; }' \
#         | head -${ntake} \
#         > ${ofile}
#       cat ${ofile} | egrep -v '[^a-z]' > .gud
#       dicio-wc ${ofile} .gud
#     end

# END
