#! /bin/csh -f
# Last edited on 2004-02-04 05:30:04 by stolfi

# Extracts words from the raw (UTF-8) Quran, for checking purposes.
# Converts them to JSAR.
#
# Reads the marked-up text from standard input and writes one token per
# line to standard output.  The pipeline has five stages:
#
#   1. gawk: drops blank lines, "#" comment lines, and lines that start
#      with "@chapter", "@part" or "@fix"; reduces "@verse{..}{..}" to
#      the contents of its second brace group; turns a lone "@=" line
#      into "/ ="; drops every other line that still starts with "@";
#      and strips the remaining "{...}" groups from the lines it prints.
#   2. html-to-hexbytes and
#   3. hexbytes-to-jshb: external filters that are not defined in this
#      file (presumably they do the actual re-encoding to JSAR; confirm
#      against those tools).
#   4. gawk: on lines that begin with a blank, removes a trailing " p";
#      then sets every run of commas apart as a single "," token and
#      breaks each line at runs of blanks and underscores.
#   5. grep: removes the empty lines left over by the splitting.
#
# Literal braces in the regular expressions are written as "[{]" and
# "[}]" so that they can never be parsed as interval expressions.

gawk \
  ' /^ *([\#]|$)/ { next; } \
    /^ *[@](chapter|part|fix)/ { next; } \
    /^ *[@]verse/ { \
      $0 = gensub(/@verse *[{](.*)[}][{](.*)[}]/, "\\2 ", "g", $0); \
    } \
    /^ *[@]= *$/ { $0 = "/ ="; } \
    /^ *[@]/ { next; } \
    /./ { gsub(/[{][^{}]*[}]/, ""); print; next; } \
  ' \
  | html-to-hexbytes \
  | hexbytes-to-jshb \
  | gawk \
      ' /^[ ]/ { gsub(/[ ]+[p] *$/, ""); } \
        /./ { \
          gsub(/[,]+/, " , "); \
          gsub(/[ _]+/, "\n"); \
          print; next; \
        } \
      ' \
  | grep -E -v '^ *$'