Hacking at the Voynich manuscript - Side notes
068 Language transducers with specified output freqs

Last edited on 2004-10-05 13:06:25 by stolfi

LINK AND DIRECTORY SETUP

  Link to the voynich work directory and to the "dat" directory of
  ../101, and create the directory "res" that receives all the
  outputs below. The "-p" lets the setup be repeated without an
  error when "res" already exists.

    ln -s /home/staff/stolfi/voynich/work

    ln -s ../101/dat dat

    mkdir -p res
  
TEXT SAMPLES
  
  The text samples to be processed, given as paths relative to
  "dat/", and one output directory under "res/" for each of them:

    set samples = ( voyn/maj/tot.1 engl/wow/tot.1 ital/psp/tot.1 chip/voa/tot.1 )

    foreach smp ( $samples )
      mkdir -p res/$smp
    end

GATHERING THE TEXTS
  
  Gather the good words from the source files and write them one word
  per line. Every input line starting with "a" contributes its third
  field as a word. Words containing "?" or "*" (bad chars) and "words"
  starting with digits are discarded (beware of pinyin tones).
  Paragraph breaks (lines of the form "# =", and lines starting with
  "p" that end in " =") are mapped to blank lines. The first 200
  output lines are shown as a check. The filtering uses "grep -E",
  since "egrep" is obsolescent in GNU grep.

    foreach smp ( ${samples} )
      set infile = dat/${smp}/raw.tlw
      set otfile = res/${smp}/gud.tks
      echo "=== ${infile} -> ${otfile} ===" 
      cat ${infile} \
        | gawk \
            ' /^[a]/ { print $3; } \
              /^[#] *[=]$/ { print ""; } \
              /^[p].* [=]$/ { print ""; } \
            ' \
        | grep -E -v -e '[?*]' -e '^[0-9]' \
        > ${otfile} 
      head -200 ${otfile} | fmt -w 72
    end
    
COMPUTING LETTER FREQUENCIES

  Compute the table of letter frequencies for each sample, sorted
  numerically on the first field in decreasing order. The sort key is
  written in the "-k" form; the old "+POS1 -POS2" form is obsolete
  and is not accepted by every version of "sort".

    foreach smp ( ${samples} )
      set infile = res/${smp}/gud.tks
      set otfile = res/${smp}/gud.lfr
      echo "=== ${infile} -> ${otfile} ===" 
      cat ${infile} \
        | gather-letter-freqs \
        | sort -b -k1,1nr \
        > ${otfile} 
      cat ${otfile}
    end
    
BUILDING MARKOV MODELS
  
  Collect the Markov transition probabilities of each sample, once
  for each order in the list "orders". Each table is sorted on its
  third field and, among lines with equal third field, numerically
  on the first field in decreasing order. The sort keys are written
  in the "-k" form; the old "+POS1 -POS2" form is obsolete and is not
  accepted by every version of "sort". The first 100 lines of each
  table are shown as a check.

    set orders = ( 0 1 2 3 )
    
    foreach smp ( ${samples} )
      foreach ord ( ${orders} )
        set infile = res/${smp}/gud.tks
        set otfile = res/${smp}/gud-${ord}.pfr
        echo "=== ${infile} -> ${otfile} ===" 
        cat ${infile} \
          | gather-transition-freqs \
              -v order=${ord} \
          | sort -b -k3,3 -k1,1nr \
          > ${otfile} 
        head -100 ${otfile}
      end
    end

TRANSDUCTION

  For each pair "A,B" of samples and each order in "orders", we read
  the letter frequency table of sample A (gud.lfr) and the transition
  frequency table of that order of sample B (gud-ORD.pfr), combine
  them into a transducer, and apply it to the text of sample A. The
  result is written to res/A/syn/B/syn-ORD.tks, and its first 200
  lines are shown as a check. Everything that depends only on the
  pair (the splitting of "A,B", the input files of A, and the
  creation of the output directory res/A/syn/B) is done once per
  pair, outside the loop on the order.

    set pairs = ( \
      engl/wow/tot.1,ital/psp/tot.1 \
      ital/psp/tot.1,engl/wow/tot.1 \
      voyn/maj/tot.1,engl/wow/tot.1 \
      voyn/maj/tot.1,chip/voa/tot.1 \
      voyn/maj/tot.1,ital/psp/tot.1 \
    )
    
    foreach p ( ${pairs} )
      set pp = ( `echo $p | tr ',' ' '` )
      set asmp = "${pp[1]}"; set bsmp = "${pp[2]}"
      set atext = res/${asmp}/gud.tks
      set alcts = res/${asmp}/gud.lfr
      set xdir = res/${asmp}/syn/${bsmp}
      mkdir -p ${xdir}
      foreach ord ( ${orders} )
        set bpcts = res/${bsmp}/gud-${ord}.pfr
        set xtext = ${xdir}/syn-${ord}.tks
        echo "=== ${atext} ( ${bpcts} ) -> ${xtext} ===" 
        cat ${atext} \
          | transducer \
              -v order=${ord} \
              -v achfile=${alcts} \
              -v bprfile=${bpcts} \
          > ${xtext} 
        head -200 ${xtext} | fmt -w 72
      end
    end