# Last edited on 2004-02-04 05:10:47 by stolfi

# Preparing the Hebrew pentateuch for analysis

# Layout of these notes: unindented lines are headings/remarks, indented
# lines are csh commands (note the csh-only `set x = ...` and
# `foreach ... end`).  There is a manual editing step between the two
# sections, so the file cannot be run unattended from top to bottom.
# The helpers remove-html-junk-t1, html-to-hexbytes, count-hexbytes and
# hexbytes-to-jshb are external tools that are not defined in this file.

CLEANING UP THE HTML

# For every chapter file in ${imp} whose name matches
# NNN-{gen,exo,lev,num,deu}-NNN.htm (taken in sorted order), this:
#   - prints the file name and appends it to "chapters.dir";
#   - appends a "# === <name> ===..." separator line to "main.htm";
#   - appends the file contents to "main.htm" after deleting carriage
#     returns (octal 015) and filtering through remove-html-junk-t1.
# Both output files are removed first, so a re-run starts from scratch.

  set imp = /home/staff/stolfi/IMPORT/texts/hebrew/Tanakh
  rm -f chapters.dir main.htm
  foreach f ( `cd ${imp} && ls [0-9]??-{gen,exo,lev,num,deu}-[0-9]??.htm | sort` )
    echo $f
    echo $f >> chapters.dir
    echo "# === $f ============================================================" >> main.htm
    cat ${imp}/${f} \
      | tr -d '\015' \
      | remove-html-junk-t1 \
      >> main.htm
  end

Manually edited "main.raw" adding pre-directives (@part, @chapter, @verse, @=).

# NOTE(review): the loop above writes "main.htm" but everything below
# reads "main.raw".  Presumably "main.raw" started as a copy of
# "main.htm"; that step is not recorded here -- TODO confirm.

STATISTICS

# Every pipeline in this section starts with the same filter:
#   sed   - removes "@verse{...}" directives embedded in a line and
#           deletes lines that start with "#" or "@";
#   egrep - drops lines that are empty or blank, or whose first
#           non-space character is "@" or "#" (or a backslash, since
#           "\" is literal inside a bracket expression).
# What remains is passed to html-to-hexbytes.

Counting characters:

# Result is written to ".char.cts".

  cat main.raw \
    | sed \
        -e 's/@verse{.*}//g' \
        -e '/^[#@]/d' \
    | egrep -v '^ *([@\#]|$)' \
    | html-to-hexbytes \
    | count-hexbytes \
    > .char.cts

Checking for invalid codes:

# Shows the first 10 converted lines that still contain "&".
# Presumably these are HTML entities that html-to-hexbytes did not
# translate -- TODO confirm against that tool.

  cat main.raw \
    | sed \
        -e 's/@verse{.*}//g' \
        -e '/^[#@]/d' \
    | egrep -v '^ *([@\#]|$)' \
    | html-to-hexbytes \
    | egrep '[&]' \
    | head -10

Checking JSHB conversion:

# Shows the first 10 lines of the hexbytes-to-jshb output for a visual
# spot check.

  cat main.raw \
    | sed \
        -e 's/@verse{.*}//g' \
        -e '/^[#@]/d' \
    | egrep -v '^ *([@\#]|$)' \
    | html-to-hexbytes \
    | hexbytes-to-jshb \
    | head -10

Checking for non-Hebrew characters:

# Strips every token of the form <<HH>> (guillemet, two uppercase hex
# digits, guillemet) and every comma, deletes lines that are then
# blank, and shows the first 10 lines that survive.  Anything printed
# is text that was not one of those tokens.
# The hex-digit pattern used to read "[0-9A-F][[0-9A-F]"; the stray "["
# made the second class also accept a literal "[", so malformed tokens
# were stripped as if valid.  It now requires exactly two hex digits.

  cat main.raw \
    | sed \
        -e 's/@verse{.*}//g' \
        -e '/^[#@]/d' \
    | egrep -v '^ *([@\#]|$)' \
    | html-to-hexbytes \
    | sed \
        -e 's/[«][0-9A-F][0-9A-F][»]//g' \
        -e 's/[,]//g' \
        -e '/^ *$/d' \
    | head -10