#! /usr/bin/gawk -f
# Last edited on 2004-01-26 21:54:23 by stolfi
# Assumes that extract-html-tags was applied to the input.
# Replace relevant HTML tags by proto-"@"-directives.
#
# Applies only to lines that start with "<".  Every opening <span ...> is
# rewritten to some "@begin-span..." marker and every closing </span> to
# "@end-span", so span nesting is preserved in the output.
/^[<]/ {
  lin = $0;

  # Opening spans, most specific first.  Order matters: once a tag has been
  # rewritten it no longer starts with "<span", so the more generic patterns
  # below only see spans that were not already classified.
  # (The "." in "18.0pt" is bracketed so that it matches a literal dot only.)
  gsub(/[<]span[^<>]*lang=AR-SA[^<>]*font-size:18[.]0pt[^<>]*[>]/, "@begin-span-arabic-chapter-num", lin);
  # ("#" needs no backslash inside a bracket expression.)
  gsub(/[<]span[^<>]*lang=AR-SA[^<>]*color[:][#]008800[^<>]*[>]/, "@begin-span-arabic-chapter-title", lin);
  gsub(/[<]span[^<>]*color[:]green[^<>]*[>]/, "@begin-span-verse-num", lin);
  gsub(/[<]span[^<>]*lang=AR-SA[^<>]*[>]/, "@begin-span-arabic-text", lin);
  gsub(/[<]span[^<>]*[>]/, "@begin-span", lin);

  # Closing spans.
  gsub(/[<][\/] *span *[>]/, "@end-span", lin);

  # Tags with href="sura<N>_files/..." become "@chapter{N}".
  lin = gensub(/[<][^<>]*href=[\"]sura([0-9]+)_files[\/][^<>]*[>]/, "@chapter{\\1}", "g", lin);

  # Tags whose href carries chapterNo=<C> and verseNo=<V> become "@verse{C}{V}".
  # Each number must be followed by at least one non-digit character inside the tag.
  lin = gensub(/[<][^<>]*href=[^<>]*chapterNo=([0-9]+)[^0-9<>][^<>]*verseNo=([0-9]+)[^0-9<>][^<>]*[>]/, "@verse{\\1}{\\2}", "g", lin);

  # Servlet links that mention display_chapter are deleted outright ...
  lin = gensub(/[<][^<>]*href=[^<>]*quran[\/]servlet[\/][^<>]*display_chapter[^<>]*[>]/, "", "g", lin);
  # ... and any servlet link still left becomes a placeholder verse marker.
  lin = gensub(/[<][^<>]*href=[^<>]*quran[\/]servlet[\/][^<>]*[>]/, "@verse{??}{??}", "g", lin);

  # Store the result back so that the following rules see the rewritten line.
  $0 = lin;
}
# Discard all other HTML tags: any line that still contains a "<...>" tag
# at this point is dropped entirely.
/[<][^<>]*[>]/ { next; }

# Discard style directives and non-Quranic text.
/behavior[:]url/ { next; }
/Submission[.]org/ { next; }
/MicrosoftInternetExplorer4/ { next; }
/Style Definitions/ { next; }
/MsoNormalTable/ { next; }
/mso-style-name[:]/ { next; }
/^mso[-]/ { next; }
/^font[-]/ { next; }
/Your information source/ { next; }
/Plastic Surgery Network/ { next; }
/^PSN$/ { next; }

# Discard timestamp-like lines.
# This pattern was previously written with shell-glob "?" wildcards; in a
# regexp "?" is a quantifier, so it could never match a 20-character line and
# instead dropped short lines such as "2" or "20".  Each "?" is now ".", i.e.
# exactly one arbitrary character.
# NOTE(review): the intended layout is presumably "20YY-MM-DD hh:mm:ss" plus
# one trailing character -- inferred from the shape of the glob; confirm
# against real input.
/^20..-..-.....[:]..[:]...$/ { next; }
# Emit every record that survived the rules above, with HTML-style
# spacing normalized to plain blanks.
{
  text = $0;

  # "&nbsp;" entities become a single blank.
  gsub(/[&]nbsp[;]/, " ", text);

  # So do TAB and the byte with octal code 240 (0xA0).
  # NOTE(review): presumably 0xA0 is a Latin-1 no-break space -- confirm the
  # input encoding.
  gsub(/[\t\240]/, " ", text);

  print text;
  next;
}