#! /usr/bin/gawk -f
# Last edited on 2004-01-26 21:54:23 by stolfi

# Assumes that extract-html-tags was applied to the input.

# Replace relevant HTML tags by proto-"@"-directives.
# Applies only to lines that begin with "<".  Every recognized tag is
# replaced in place, so opening and closing span markers keep the same
# nesting as the original <span>...</span> tags.
#
# The substitutions are order-dependent: the more specific span patterns
# must run before the more general ones, because a tag that has already
# been rewritten no longer matches any later pattern.
/^[<]/ {
  lin = $0;
  
  # Opening <span> tags, from most specific to least specific.
  # Literal "." and "#" are written as bracket expressions so that they
  # match only themselves and need no backslash escape.
  gsub(/[<]span[^<>]*lang=AR-SA[^<>]*font-size:18[.]0pt[^<>]*[>]/, "@begin-span-arabic-chapter-num", lin);
  gsub(/[<]span[^<>]*lang=AR-SA[^<>]*color[:][#]008800[^<>]*[>]/, "@begin-span-arabic-chapter-title", lin);
  gsub(/[<]span[^<>]*color[:]green[^<>]*[>]/, "@begin-span-verse-num", lin);
  gsub(/[<]span[^<>]*lang=AR-SA[^<>]*[>]/, "@begin-span-arabic-text", lin);
  gsub(/[<]span[^<>]*[>]/, "@begin-span", lin);

  # Closing </span> tags (optional blanks around the word "span").
  gsub(/[<][\/] *span *[>]/, "@end-span", lin);

  # Links into a "suraNNN_files/" directory become @chapter{NNN}.
  lin = gensub(/[<][^<>]*href=["]sura([0-9]+)_files[\/][^<>]*[>]/, "@chapter{\\1}", "g", lin);

  # Links carrying chapterNo=C ... verseNo=V become @verse{C}{V}.
  # Each number must be followed by a non-digit inside the tag, so the
  # captured digits are always the complete number.
  lin = gensub(/[<][^<>]*href=[^<>]*chapterNo=([0-9]+)[^0-9<>][^<>]*verseNo=([0-9]+)[^0-9<>][^<>]*[>]/, "@verse{\\1}{\\2}", "g", lin);

  # Remaining "quran/servlet/" links: display_chapter links are deleted;
  # any other one becomes a placeholder verse reference.
  lin = gensub(/[<][^<>]*href=[^<>]*quran[\/]servlet[\/][^<>]*display_chapter[^<>]*[>]/, "", "g", lin);
  lin = gensub(/[<][^<>]*href=[^<>]*quran[\/]servlet[\/][^<>]*[>]/, "@verse{??}{??}", "g", lin);
  
  # Hand the rewritten line to the rules that follow.
  $0 = lin;
}

# Discard all other HTML tags: any line that still contains a "<...>"
# tag at this point is dropped entirely.
/[<][^<>]*[>]/ { next; }

# Discard style directives and non-Quranic text.
/behavior[:]url/ { next; }
/Submission[.]org/ { next; }
/MicrosoftInternetExplorer4/ { next; }
/Style Definitions/ { next; }
/MsoNormalTable/ { next; }
/mso-style-name[:]/ { next; }
/^mso[-]/ { next; }
/^font[-]/ { next; }
/Your information source/ { next; }
/Plastic Surgery Network/ { next; }
/^PSN$/ { next; }

# Discard timestamp-shaped lines: "20" + 2 chars, "-", 2 chars, "-",
# 5 chars, ":", 2 chars, ":", 3 chars.  Each "." matches exactly one
# arbitrary character.
# NOTE(review): presumably these are stamps such as 2003-12-05T14:22:00Z;
# confirm against the actual input.
# Do not write this with shell-style "?" wildcards: in a regexp "?" makes
# the preceding item optional, so /^20??-??-.../ accepts bare lines such
# as "2" or "20" (silently dropping them) and never matches a timestamp.
/^20..-..-.....[:]..[:]...$/ { next; }

# Print whatever is left, after turning HTML "funny spaces" into blanks.
# This rule has no pattern, so it runs for every line that reaches it.
{
  # The "&nbsp;" entity becomes a single blank:
  gsub(/[&]nbsp[;]/, " ");

  # So do tabs and the byte \240 (octal for 0xA0, the Latin-1 no-break space):
  gsub(/[\t\240]/, " ");

  print;
  next;
}