Hacking at the Voynich manuscript - Side notes
066 Looking for maximal repeated sequences in various languages

Last edited on 2025-09-24 15:14:12 by stolfi

  Marke Fincher has been looking at maximal repeated substrings
  in the VMS. Here I do the same for other languages.

LINK SETUP

  Make symbolic links named work and dat in the current directory.
  The first points to an absolute path and the second to a path
  relative to the current directory.

  ln -s /home/staff/stolfi/voynich/work .
  ln -s ../101/dat .
  
COLLECTING THE REPEATED WORDS

  The commands below are for csh or tcsh. For each sample they take
  the first maxwds lines of dat/SAMPLE/raw.tlw, replace every line
  whose first field is "a" by its third field and every other line by
  a single "*", pipe the result through find-repeated-substrings with
  the settings maxlen and wsep, and write the sorted output to
  rep/SAMPLE/raw.rep. The revbytes, sort, revbytes sequence presumably
  orders the output lines by their reversed text, that is, by their
  endings; this should be confirmed against revbytes itself. The
  variable wfile is set but is not used by these commands.

  set samples = ( engl/cul latn/ahl chin/red chip/voa viet/ptt tibe/ccv )
  set maxwds = 10000
  set maxlen = 50

  mkdir -p rep

  foreach s ( ${samples} )
    echo "=== ${s} ==="
    set wfile = "langbank/${s}/main.wds"
    mkdir -p rep/${s}
    cat dat/${s}/raw.tlw \
      | head -n ${maxwds} \
      | gawk '($1 == "a"){ print $3; next; } // { print "*"; }' \
      | find-repeated-substrings \
          -v maxlen=${maxlen} \
          -v wsep=':' \
      | revbytes | sort | revbytes \
      > rep/${s}/raw.rep
  end