# Last edited on 2004-02-04 06:28:27 by stolfi
# Pentateuch (first 5 books) of the New Chinese Bible,
# in GB encoding.

JOINING AND CLEANING THE HTML

  # Directory holding the source ".htm" files.
  set imp = /home/staff/stolfi/IMPORT/texts/chinese/NewBible

  # Start from scratch: both output files below are built by appending.
  rm -f main.raw chapters.dir
  # Visit, in sorted file-name order, the files in ${imp} whose names match
  # the pattern for the five books (gen, exo, lev, num, deu).
  foreach chap ( `cd ${imp} && ls [0-9]??-{gen,exo,lev,num,deu}-[0-9]??.htm | sort` )
    # Show the file name on the terminal and log it in "chapters.dir".
    echo $chap | tee -a chapters.dir
    # Delete CR characters, then pass the text through the tools
    # remove-html-junk-c1 and spread-gb-codes (defined elsewhere; not shown
    # here) and append the result to "main.raw".
    tr -d '\015' < ${imp}/${chap} \
      | remove-html-junk-c1 -v fname="$chap" \
      | spread-gb-codes \
      >> main.raw
  end
  
  Manually edited "main.raw", adding pre-directives (@part, @chapter, @verse, @=).