#! /bin/csh -f
# Last edited on 2004-02-04 05:30:04 by stolfi

# Extracts words from the raw (UTF-8) Quran, for checking purposes.
# Converts them to JSAR.

# Pipeline stages (reads the marked-up text on stdin, writes one token
# per line on stdout):
#
#  1. gawk - Skips lines that are empty, all blanks, or whose first
#     non-blank character is a hash sign, and also the @chapter, @part
#     and @fix directive lines.  On an @verse line the @verse{..}{..}
#     directive is replaced by the contents of its second brace group
#     followed by one blank.  A line holding only @= is turned into / =.
#     Any line that still starts with @ is then dropped.  Every surviving
#     non-empty line is printed after deleting the brace groups that
#     contain no nested braces.
#     NOTE(review): the two (.*) groups in the @verse pattern are greedy,
#     so an @verse line with more than two brace groups keeps only the
#     contents of the last one - confirm that such lines never occur.
#
#  2. html-to-hexbytes, hexbytes-to-jshb - External filters that are not
#     defined in this file; presumably they perform the recoding announced
#     in the header comment - TODO confirm against those tools.
#
#  3. gawk - On lines that begin with a blank, strips a trailing p that is
#     preceded by blanks.  Then, on every non-empty line, surrounds each
#     run of commas with blanks and breaks the line at every run of
#     blanks and/or underscores, so each token lands on its own line.
#
#  4. grep - Discards output lines that are empty or all blanks.  Plain
#     grep is used rather than the obsolescent egrep command: the pattern
#     has the same meaning in basic and extended syntax, and recent GNU
#     grep releases print a warning each time egrep is invoked.
#
# The backslash at the end of each line inside the quoted awk programs is
# required by csh, which does not accept a bare newline inside quotes.

gawk \
    ' /^ *([\#]|$)/ { next; } \
      /^ *[@](chapter|part|fix)/ { next; } \
      /^ *[@]verse/ { \
        $0 = gensub(/@verse *{(.*)}{(.*)}/, "\\2 ", "g", $0); \
      } \
      /^ *[@]= *$/ { $0 = "/ ="; } \
      /^ *[@]/ { next; } \
      /./ { gsub(/[{][^{}]*[}]/, ""); print; next; } \
    ' \
  | html-to-hexbytes \
  | hexbytes-to-jshb \
  | gawk \
    ' /^[ ]/ { gsub(/[ ]+[p] *$/, ""); } \
      /./ { \
        gsub(/[,]+/, " , "); \
        gsub(/[ _]+/, "\n"); \
        print; next; \
      } \
    ' \
  | grep -v '^ *$'