# Last edited on 2004-02-04 05:10:47 by stolfi
# Preparing the Hebrew Pentateuch for analysis

CLEANING UP THE HTML

  set imp = /home/staff/stolfi/IMPORT/texts/hebrew/Tanakh

  # Rebuild "chapters.dir" (the chapter file names, in processing order) and
  # "main.htm" (all chapters concatenated, each preceded by a "# === FILE ==="
  # banner line).  Both files are only appended to below, so stale copies
  # are removed first.
  # The file list comes from "ls | sort" run inside ${imp}, so names are
  # relative to that directory and ordered by their leading number.
  # Each chapter has its carriage returns (octal 015) stripped and is then
  # passed through remove-html-junk-t1 (project filter, not shown here).
  rm -f main.htm chapters.dir
  foreach f ( `cd ${imp} && ls [0-9]??-{gen,exo,lev,num,deu}-[0-9]??.htm | sort` )
    echo $f | tee -a chapters.dir
    echo "# === $f ============================================================" >> main.htm
    tr -d '\015' < ${imp}/${f} \
      | remove-html-junk-t1 \
      >> main.htm
  end
  
  Manually edited "main.raw" (presumably a renamed copy of "main.htm"; the copy step is not recorded here), adding pre-directives (@part, @chapter, @verse, @=).

STATISTICS

  Counting characters:

    # Writes the character counts for the running text of "main.raw" to
    # ".char.cts".
    # Preprocessing: remove every @verse{...} directive, then discard lines
    # that start with "#" or "@", blank lines, and lines whose first
    # non-blank character is "@", "\" or "#" (a backslash is literal inside
    # a bracket expression, so "[@\#]" matches it too).
    # "[^}]*" stops at the first "}", so text lying between two directives
    # on the same line is kept instead of being swallowed by a greedy ".*".
    # html-to-hexbytes and count-hexbytes are project tools not shown here.
    sed \
        -e 's/@verse{[^}]*}//g' \
        -e '/^[#@]/d' \
        main.raw \
      | grep -E -v '^ *([@\#]|$)' \
      | html-to-hexbytes \
      | count-hexbytes \
      > .char.cts

  Checking for invalid codes:

    # Shows the first 10 lines that still contain "&" after html-to-hexbytes.
    # Presumably a leftover "&" marks an HTML entity the converter did not
    # recognise -- confirm against html-to-hexbytes (project tool, not shown).
    # Preprocessing: remove every @verse{...} directive, then discard lines
    # that start with "#" or "@", blank lines, and lines whose first
    # non-blank character is "@", "\" or "#" (a backslash is literal inside
    # a bracket expression, so "[@\#]" matches it too).
    # "[^}]*" stops at the first "}", so text lying between two directives
    # on the same line is kept instead of being swallowed by a greedy ".*".
    sed \
        -e 's/@verse{[^}]*}//g' \
        -e '/^[#@]/d' \
        main.raw \
      | grep -E -v '^ *([@\#]|$)' \
      | html-to-hexbytes \
      | grep -F '&' \
      | head -10

  Checking JSHB conversion:

    # Shows the first 10 lines of the text after conversion by
    # html-to-hexbytes and hexbytes-to-jshb (project tools, not shown here),
    # for visual inspection.
    # Preprocessing: remove every @verse{...} directive, then discard lines
    # that start with "#" or "@", blank lines, and lines whose first
    # non-blank character is "@", "\" or "#" (a backslash is literal inside
    # a bracket expression, so "[@\#]" matches it too).
    # "[^}]*" stops at the first "}", so text lying between two directives
    # on the same line is kept instead of being swallowed by a greedy ".*".
    sed \
        -e 's/@verse{[^}]*}//g' \
        -e '/^[#@]/d' \
        main.raw \
      | grep -E -v '^ *([@\#]|$)' \
      | html-to-hexbytes \
      | hexbytes-to-jshb \
      | head -10

  Checking for non-Hebrew characters:

    # Shows the first 10 lines that still have content after every
    # two-hex-digit byte code and every comma has been removed from the
    # html-to-hexbytes output; whatever is printed was not converted.
    # NOTE(review): assumes html-to-hexbytes (project tool, not shown here)
    # writes each byte as two uppercase hex digits between guillemets, and
    # that the guillemets here are in the same character encoding as the
    # tool's output -- confirm both.
    # Preprocessing: remove every @verse{...} directive, then discard lines
    # that start with "#" or "@", blank lines, and lines whose first
    # non-blank character is "@", "\" or "#" (a backslash is literal inside
    # a bracket expression, so "[@\#]" matches it too).
    # "[^}]*" stops at the first "}", so text lying between two directives
    # on the same line is kept instead of being swallowed by a greedy ".*".
    sed \
        -e 's/@verse{[^}]*}//g' \
        -e '/^[#@]/d' \
        main.raw \
      | grep -E -v '^ *([@\#]|$)' \
      | html-to-hexbytes \
      | sed \
          -e 's/«[0-9A-F][0-9A-F]»//g' \
          -e 's/,//g' \
          -e '/^ *$/d' \
      | head -10