Hacking at the Voynich manuscript - Side notes
066 Looking for maximal repeated sequences in various languages

Last edited on 2004-09-13 19:38:26 by stolfi

  Marke Fincher has been looking at maximal repeated substrings
  in the VMS. Here I do the same for other languages.

LINK SETUP

  # Create symbolic links named work and dat in the current directory.
  ln -s /home/staff/stolfi/voynich/work .
  ln -s ../101/dat .
  
COLLECTING THE REPEATED WORDS

  # Samples to process. Each name is a sub-path under dat for input
  # and under rep for output.
  set samples = ( chip/voa/tot.1 viet/ptt/tot.1 )
  # Number of leading lines of each raw.tlw file that are used.
  set maxwds = 3000
  # Passed as the maxlen variable to find-repeated-substrings.
  set maxlen = 50

  # Use -p so that re-running these commands does not fail
  # when the rep directory already exists.
  mkdir -p rep

  foreach s ( ${samples} )
    mkdir -p rep/${s}
    echo "${s}"
    # Input lines whose first field is a contribute their third field,
    # every other line becomes a one-star placeholder line.
    # NOTE - presumably revbytes reverses each line, so that the
    # output ends up sorted by line endings - TODO confirm.
    head -n ${maxwds} dat/${s}/raw.tlw \
      | gawk '($1 == "a"){ print $3; next; } // { print "*"; }' \
      | find-repeated-substrings \
          -v maxlen=${maxlen} \
          -v wsep=':' \
      | revbytes | sort | revbytes \
      > rep/${s}/raw.rep
  end