#! /usr/bin/gawk -f
# Last edited on 2004-01-26 21:54:23 by stolfi

# Line-oriented filter: rewrites selected HTML tags as proto-"@"-directives,
# drops every line that still contains a tag or that matches known
# boilerplate, and prints the remaining lines with funny spaces normalized.
#
# Assumes that extract-html-tags was applied to the input.
# NOTE(review): only lines that begin with "<" are rewritten, and any line
# with a leftover tag is dropped whole, so the input presumably carries each
# tag on a line of its own -- confirm against extract-html-tags.
#
# Requires gawk: gensub() is a GNU extension.

# Replace relevant HTML tags by proto-"@"-directives.
# Preserve span nesting:
/^[<]/ {
  lin = $0;

  # Opening <span> tags.  Order matters: each gsub consumes the tags it
  # matches, so the patterns go from most specific to the catch-all.
  gsub(/[<]span[^<>]*lang=AR-SA[^<>]*font-size:18[.]0pt[^<>]*[>]/, "@begin-span-arabic-chapter-num", lin);
  gsub(/[<]span[^<>]*lang=AR-SA[^<>]*color[:][#]008800[^<>]*[>]/, "@begin-span-arabic-chapter-title", lin);
  gsub(/[<]span[^<>]*color[:]green[^<>]*[>]/, "@begin-span-verse-num", lin);
  gsub(/[<]span[^<>]*lang=AR-SA[^<>]*[>]/, "@begin-span-arabic-text", lin);
  gsub(/[<]span[^<>]*[>]/, "@begin-span", lin);

  # Closing </span> tags.
  gsub(/[<][\/] *span *[>]/, "@end-span", lin);

  # Links to "suraNNN_files/..." become @chapter{NNN}.
  lin = gensub(/[<][^<>]*href=[\"]sura([0-9]+)_files[\/][^<>]*[>]/, "@chapter{\\1}", "g", lin);

  # Links carrying chapterNo=C and verseNo=V become @verse{C}{V}.
  lin = gensub(/[<][^<>]*href=[^<>]*chapterNo=([0-9]+)[^0-9<>][^<>]*verseNo=([0-9]+)[^0-9<>][^<>]*[>]/, "@verse{\\1}{\\2}", "g", lin);

  # Remaining quran/servlet links: display_chapter links are removed
  # (replaced by the empty string); any other one becomes a verse
  # directive with unknown numbers.  The specific rule must run first.
  lin = gensub(/[<][^<>]*href=[^<>]*quran[\/]servlet[\/][^<>]*display_chapter[^<>]*[>]/, "", "g", lin);
  lin = gensub(/[<][^<>]*href=[^<>]*quran[\/]servlet[\/][^<>]*[>]/, "@verse{??}{??}", "g", lin);

  $0 = lin;
}

# Discard all other HTML tags.
# (The whole line is dropped if any "<...>" survived the rewriting above.)
/[<][^<>]*[>]/ { next; }

# Discard style directives and non-Quranic text.
/behavior[:]url/ { next; }
/Submission[.]org/ { next; }
/MicrosoftInternetExplorer4/ { next; }
/Style Definitions/ { next; }
/MsoNormalTable/ { next; }
/mso-style-name[:]/ { next; }
/^mso[-]/ { next; }
/^font[-]/ { next; }
/Your information source/ { next; }
/Plastic Surgery Network/ { next; }
/^PSN$/ { next; }

# Discard lines that consist solely of a timestamp shaped like
# "20YY-MM-DDxHH:MM:SSx", where each "." stands for one arbitrary character.
# Regexp "." is used rather than shell-glob "?": in a regexp "?" makes the
# preceding item optional, which would match lines such as "2" or "20"
# and never match an actual timestamp.
/^20..-..-.....[:]..[:]...$/ { next; }

# Print whatever is left.
//{
  lin = $0;
  # Replace HTML funny spaces by " ":
  gsub(/[&]nbsp[;]/, " ", lin);
  # Tab and byte 0xA0 (octal 240).
  # NOTE(review): 0xA0 is presumably a no-break space in the input's
  # single-byte encoding -- confirm the encoding before running on UTF-8.
  gsub(/[\t\240]/, " ", lin);
  print lin;
  next;
}