#! /bin/csh -f
# Last edited on 2004-01-30 06:55:52 by stolfi

# Extracts words from the raw (HTML) Quran, for checking purposes.
# Converts them to JSAR.

# Usage: this script takes no file arguments; the first gawk reads the
# marked-up text from standard input and the last stage writes the
# result to standard output.
#
# Pipeline stages:
#
#   1. gawk (markup normalization), rules tried in this order:
#        - lines that are empty, all spaces, or whose first non-space
#          character is a hash sign are dropped;
#        - lines containing @verse: every @verse is deleted, every
#          open brace is deleted, every close brace becomes a period,
#          and the line is printed;
#        - lines containing @chapter: braces become spaces, field 2
#          (cn) and field 4 (t) are picked out and four lines are
#          printed: cn followed by a period, then #@@ followed by t,
#          then the two separator lines also used for @= below
#          (cn and t are presumably the chapter number and title;
#          confirm against the input format);
#        - lines containing @= are replaced by the same two
#          separator lines;
#        - any other non-empty line is printed unchanged.
#
#   2. har-to-hexbytes, then hexbytes-to-jsar: external filters that
#      are not defined in this file (presumably they perform the
#      conversion to JSAR mentioned in the header comment).
#
#   3. gawk (word splitting): every #@@ marker is replaced by a space;
#      then, on each non-empty line, leading and trailing runs of
#      underscores and spaces are stripped and each remaining run of
#      underscores and/or spaces becomes a newline, so that each word
#      ends up on a line of its own.
#
#   4. grep -E -v: drops lines that are empty or contain only spaces.
#      (grep -E is used in place of the obsolescent egrep command,
#      which prints a warning on recent GNU grep releases.)
#
# Note: the backslash at the end of each line inside the quoted gawk
# programs is required by csh to continue a quoted string across lines.

gawk \
     ' /^ *([\#]|$)/ { next; } \
       /[@]verse/ { \
         gsub(/[@]verse/, ""); gsub(/[{]/, ""); gsub(/[}]/, "."); \
         print; next; \
       } \
       /[@]chapter/ { \
         gsub(/[{}]/, " "); cn=$2; t=$4; \
         printf "%s.\n#@@ %s\n÷\n=\n", cn, t; next; \
       } \
       /[@][=]/ { printf "÷\n=\n"; next; } \
       /./ { printf "%s\n", $0; } \
     ' \
  | har-to-hexbytes \
  | hexbytes-to-jsar \
  | gawk \
     ' /[#][@][@]/ { gsub(/[#][@][@]/, " "); } \
       /./ { \
         gsub(/[_ ]+$/, ""); \
         gsub(/^[ ]*[_ ]*/, ""); gsub(/[_ ]+/, "\n"); \
         print; next; \
       } \
     ' \
  | grep -E -v '^ *$'