#! /bin/gawk -f
# Last edited on 2004-01-29 15:59:46 by stolfi

# Extracts words from the raw (HTML) Quran, for checking purposes.
# Converts them to JSAR.
#
# Input records fall into three classes:
#   - blank lines and "#" comment lines, which are ignored;
#   - "@" directive lines ("@chapter", "@verse", "@=", others);
#   - everything else, which is treated as running text.
# Output is written with printf only, so line breaks appear exactly
# where the rules below put them and nowhere else.

# Writes a chapter heading: the chapter number followed by ".", then
# the title with each "_" turned into a line break and each "/" placed
# on a line of its own.  No newline is written after the title.
function emit_chapter(number, title) {
  gsub(/[_]/, "\n", title);
  gsub(/[/]/, "\n/\n", title);
  printf "%s.\n%s", number, title;
}

# Ignore lines that are empty, all blanks, or start (after optional
# blanks) with "#".
/^ *([\#]|$)/ { next }

# Sukun removal is currently disabled.  When enabled it deleted the
# mark from every non-empty record before any further processing:
#   gsub(/[°]/, "", $0);

# Directive lines: first non-blank character is "@".
/^ *[@]/ {
  # Replace the braces around the arguments by blanks, so that the
  # directive name and its arguments become separate fields.
  gsub(/[{}]/, " ", $0);

  if ($0 ~ /^ *[@]chapter/) {
    # Field 2 is printed as the chapter number, field 4 as the title.
    emit_chapter($2, $4);
  } else if ($0 ~ /^ *[@]verse/) {
    # Fields 2 and 3 are printed as "<f2>.<f3>." on a line of its own.
    printf "%s.%s.\n", $2, $3;
  } else if ($0 ~ /^ *[@][=]/) {
    # Same marker lines that an "=" inside running text produces.
    printf "\n÷\n=\n";
  }

  # Any other directive produces no output at all.
  next;
}

# Running text: split into one token per output line.
/./ {
  words = $0;

  # Remove known crud: every brace-delimited group that contains no
  # nested braces.
  gsub(/[{][^{}]*[}]/, "", words);

  # Isolate punctuation: surround each "." and "/" with blanks so that
  # it ends up as a token of its own.
  words = gensub(/[.\/]/, " \\0 ", "g", words);

  # Normalize spaces: drop leading blanks, squeeze runs to one blank.
  sub(/^[ ]*/, "", words);
  gsub(/[ ][ ]+/, " ", words);

  # Normalize end of paragraph: an "=" together with any "_" directly
  # before it becomes the marker pair "÷", "=" (the "_" placeholders
  # are turned into line breaks just below).
  gsub(/[_]*[=]/, "_÷_=_", words);

  # Break words and write.  Nothing is appended after the last token,
  # so it is terminated only when the processed line itself ends in a
  # blank or "_" (as it does after ".", "/" or "=").
  gsub(/[ ]/, "\n", words);
  gsub(/[_]/, "\n", words);
  printf "%s", words;
}