#! /bin/gawk -f
# Last edited on 2004-01-30 06:39:06 by stolfi
# Tries to convert from the codes used in the
# HTML Transliterated Quran (HAR encoding)
# to hexbytes (lowercase byte of Unicode in hexadecimal and «»).
# These rules are mustly guesses, and almost surely wrong.
# Pass-through rule: a line whose first non-blank character is "#" or "@"
# is copied to the output verbatim and skips the conversion rule.
$0 ~ /^[ ]*[\#@]/ {
  print $0;
  next;
}
// {
  # Main conversion rule: applied to every line not consumed by an earlier
  # rule. The line is rewritten step by step in "lin" and printed at the end:
  #   1. normalize whitespace and HTML markup;
  #   2. split fused marked-up groups into one tag pair per HAR code;
  #   3. compress each HAR code to "«mZ»" (markup letter + code);
  #   4. map each "«mZ»" to a hexbyte "«xx»";
  #   5. apply heuristic fix-ups for weak letters and paragraph marks;
  #   6. flag any leftover letters as "«**»{x}".
  # NOTE(review): the octal \240 below and the one-character classes
  # [×], [«], [»] presumably assume a single-byte (Latin-1) file and
  # locale -- confirm before running under a UTF-8 locale.
  lin = $0;

  # Map tabs and non-breaking spaces to plain space:
  gsub(/[\011\240]/, " ", lin);

  # Delete boldface (silent/spoken) markup; <strong> is treated as <b>:
  gsub(/[Ss][Tt][Rr][Oo][Nn][Gg]>/, "B>", lin);
  gsub(/<[\/]*[bB]>/, "", lin);

  # Replace <u><i>...</i></u> (and its equivalent inversion
  # <i><u>...</u></i>) by <Z>...</Z>. The "Z" markup is what the
  # splitting loop and the «ZTh»/«Zth» rules below expect; deleting
  # the combined tags instead would make those rules unreachable.
  gsub(/<[uU]><[iI]>/, "<Z>", lin);
  gsub(/<\/[iI]><\/[uU]>/, "</Z>", lin);
  gsub(/<[iI]><[uU]>/, "<Z>", lin);
  gsub(/<\/[uU]><\/[iI]>/, "</Z>", lin);

  # Normalize <u> and <i> (opening and closing) to upper case:
  gsub(/[uU]>/, "U>", lin);
  gsub(/[iI]>/, "I>", lin);

  # Decode character by character. Note that consecutive characters
  # with the same markup (<U>, <I>, or <Z>) may have been fused into a
  # single tag pair. This loop breaks such groups into individual codes
  # by closing the tag after the first code and reopening it before the
  # next one. It repeats until no rule changes the line; the two-letter
  # code "th" is tried first so that it is not split into "t" + "h".
  change = 1;
  while (change)
    { tmp = gensub(/<([UIZ])>([tT][hH])([adhstnlADHSTNL ])/, "<\\1>\\2</\\1><\\1>\\3", "g", lin);
      change = (tmp != lin);
      lin = tmp;
      if (! change)
        { tmp = gensub(/<([UIZ])>([tT])([adstnlADSTNL ])/, "<\\1>\\2</\\1><\\1>\\3", "g", lin);
          change = (tmp != lin);
          lin = tmp;
        }
      if (! change)
        { tmp = gensub(/<([UIZ])>([adhsnlADHSNL ])([adhstnlADHSTNL ])/, "<\\1>\\2</\\1><\\1>\\3", "g", lin);
          change = (tmp != lin);
          lin = tmp;
        }
    }
  # A marked-up lone space is just a space:
  gsub(/<[UIZ]> <\/[UIZ]>/, " ", lin);

  # Protect "/" in closing HTML tags (since "/" is itself a HAR code):
  gsub(/<[\/]/, "<@", lin);

  # ------------------------------------------------------------
  # Compress each HAR code to "«mZ»" where "m" is the markup [UIZN]
  # and "Z" is the original 1- or 2-letter code:

  # Put a "×" in front of each unconverted character:
  lin = gensub(/(.)/, "×\\1", "g", lin);
  # Undo it for spaces, digits, punctuation:
  lin = gensub(/[×]([- 0-9,._])/, "\\1", "g", lin);
  # Undo it for (single-letter) HTML markup tags:
  lin = gensub(/[×][<][×]([a-zA-Z])[×][>]/, "<\\1>", "g", lin);
  lin = gensub(/[×][<][×][@][×]([a-zA-Z])[×][>]/, "<@\\1>", "g", lin);

  # Compress two-letter codes with HTML markup:
  lin = gensub(/[<]([UIZ])[>][×]([Tt])[×]([h])[<][@][UIZ][>]/, "«\\1\\2\\3»", "g", lin);
  # Compress single-letter codes with HTML markup:
  lin = gensub(/[<]([UIZ])[>][×]([AaDdHhSsTtNnLl])[<][@][UIZ][>]/, "«\\1\\2»", "g", lin);

  # Compress double-letter codes, plain (fake an "N" markup):
  lin = gensub(/[×]A[×]A/, "«NAA»", "g", lin);
  lin = gensub(/[×]([Ee])[×]e/, "«N\\1e»", "g", lin);
  lin = gensub(/[×]([Oo])[×]o/, "«N\\1o»", "g", lin);
  lin = gensub(/[×]([gktsGKTS])[×][h]/, "«N\\1h»", "g", lin);
  # Compress single-letter codes, plain (fake an "N" markup):
  lin = gensub(/[×]([\/abdfhijklmnoqrstuwyzABDFHIJKLMNOQRSTUWYZ])/, "«N\\1»", "g", lin);

  # ------------------------------------------------------------
  # Now map compressed HAR codes "«mZ»" to Unicode hexbyte "«xx»".
  # Codes that start with an upper-case letter get a temporary "^"
  # prefix, which is discarded again further down.

  # Uppercase:
  gsub(/[«]UL[»]/, "^«**»{ul}", lin); # Possible typo?
  gsub(/[«]UN[»]/, "^«**»{un}", lin); # Possible typo?
  gsub(/[«]UA[»]/, "^«4E»", lin);
  gsub(/[«]NB[»]/, "^«28»", lin);
  gsub(/[«]NT[»]/, "^«2A»", lin);
  gsub(/[«]NTh[»]/, "^«2B»", lin);
  gsub(/[«]NJ[»]/, "^«2C»", lin);
  gsub(/[«]UH[»]/, "^«2D»", lin);
  gsub(/[«]NKh[»]/, "^«2E»", lin);
  gsub(/[«]ND[»]/, "^«2F»", lin);
  gsub(/[«]UTh[»]/, "^«30»", lin);
  gsub(/[«]NR[»]/, "^«31»", lin);
  gsub(/[«]NZ[»]/, "^«32»", lin);
  gsub(/[«]NS[»]/, "^«33»", lin);
  gsub(/[«]NSh[»]/, "^«34»", lin);
  gsub(/[«]US[»]/, "^«35»", lin);
  gsub(/[«]UD[»]/, "^«36»", lin);
  gsub(/[«]UT[»]/, "^«37»", lin);
  gsub(/[«]ZTh[»]/, "^«38»", lin);
  gsub(/[«]NGh[»]/, "^«3A»", lin);
  gsub(/[«]NF[»]/, "^«41»", lin);
  gsub(/[«]NQ[»]/, "^«42»", lin);
  gsub(/[«]NK[»]/, "^«43»", lin);
  gsub(/[«]NL[»]/, "^«44»", lin);
  gsub(/[«]NM[»]/, "^«45»", lin);
  gsub(/[«]NN[»]/, "^«46»", lin);
  gsub(/[«]NH[»]/, "^«47»", lin);
  gsub(/[«]NOo[»]/, "^«48»", lin);
  gsub(/[«]NW[»]/, "^«48»", lin);
  gsub(/[«]NEe[»]/, "^«4A»", lin);
  gsub(/[«]NY[»]/, "^«4A»", lin);
  gsub(/[«]NA[»]/, "^«4E»", lin);
  gsub(/[«]NO[»]/, "^«4E»", lin);
  gsub(/[«]NU[»]/, "^«4F»", lin);
  gsub(/[«]NI[»]/, "^«50»", lin);

  # Lowercase:
  gsub(/[«]Ul[»]/, "«**»{ul}", lin); # Possible typo?
  gsub(/[«]Un[»]/, "«**»{un}", lin); # Possible typo?
  gsub(/[«]N[\/][»]/, "«21»", lin);
  gsub(/[«]Ua[»]/, "«4E»", lin);
  gsub(/[«]Nb[»]/, "«28»", lin);
  gsub(/[«]Nt[»]/, "«2A»", lin);
  gsub(/[«]Nth[»]/, "«2B»", lin);
  gsub(/[«]Nj[»]/, "«2C»", lin);
  gsub(/[«]Uh[»]/, "«2D»", lin);
  gsub(/[«]Nkh[»]/, "«2E»", lin);
  gsub(/[«]Nd[»]/, "«2F»", lin);
  gsub(/[«]Uth[»]/, "«30»", lin);
  gsub(/[«]Nr[»]/, "«31»", lin);
  gsub(/[«]Nz[»]/, "«32»", lin);
  gsub(/[«]Ns[»]/, "«33»", lin);
  gsub(/[«]Nsh[»]/, "«34»", lin);
  gsub(/[«]Us[»]/, "«35»", lin);
  gsub(/[«]Ud[»]/, "«36»", lin);
  gsub(/[«]Ut[»]/, "«37»", lin);
  gsub(/[«]Zth[»]/, "«38»", lin);
  gsub(/[«]NAA[»]/, "«39»", lin);
  gsub(/[«]Ngh[»]/, "«3A»", lin);
  gsub(/[«]Nf[»]/, "«41»", lin);
  gsub(/[«]Nq[»]/, "«42»", lin);
  gsub(/[«]Nk[»]/, "«43»", lin);
  gsub(/[«]Nl[»]/, "«44»", lin);
  gsub(/[«]Nm[»]/, "«45»", lin);
  gsub(/[«]Nn[»]/, "«46»", lin);
  gsub(/[«]Nh[»]/, "«47»", lin);
  gsub(/[«]Nw[»]/, "«48»", lin);
  gsub(/[«]Noo[»]/, "«48»", lin);
  gsub(/[«]Ny[»]/, "«4A»", lin);
  gsub(/[«]Nee[»]/, "«4A»", lin);
  gsub(/[«]Na[»]/, "«4E»", lin);
  gsub(/[«]No[»]/, "«4E»", lin);
  gsub(/[«]Nu[»]/, "«4F»", lin);
  gsub(/[«]Ni[»]/, "«50»", lin);

  # OK, now get rid of the uppercase marker "^".
  # (Not worth the trouble to take it into account below.)
  gsub(/[\^][«]/, "«", lin);

  # Restore "/" in HTML closing tags (undoes the "<@" protection above,
  # so that the </p> rule below can see the closing paragraph tag):
  gsub(/<[@]/, "</", lin);

  # Replacement of word-initial weak+alif by alif (+ weak).
  # (Well, I guess this is better than leaving them there...).
  # NOTE(review): the second rule drops «4E» entirely, whereas the
  # corresponding «4F» and «50» rules keep the weak letter after the
  # alif -- confirm that this asymmetry is intended.
  lin = gensub(/(^|[^»])«4E»«27»«4E»/, "\\1«27»«4E»", "g", lin);
  lin = gensub(/(^|[^»])«4E»«27»/, "\\1«27»", "g", lin);
  lin = gensub(/(^|[^»])«4F»«27»«4F»/, "\\1«27»«4F»", "g", lin);
  lin = gensub(/(^|[^»])«4F»«27»/, "\\1«27»«4F»", "g", lin);
  lin = gensub(/(^|[^»])«50»«27»«50»/, "\\1«27»«50»", "g", lin);
  lin = gensub(/(^|[^»])«50»«27»/, "\\1«27»«50»", "g", lin);

  # Replacement of other word-initial weak letters by alif (+ weak).
  # (Well, I guess this is better than leaving them there...).
  # NOTE(review): there is no rule here for word-initial «4F» -- confirm.
  lin = gensub(/(^|[^»])«4E»/, "\\1«27»", "g", lin);
  lin = gensub(/(^|[^»])«50»/, "\\1«27»«50»", "g", lin);

  # Replacement of double weak letters by dammatan, etc.
  # (This is even more doubtful than the above...).
  lin = gensub(/«4E»«4E»/, "«4B»", "g", lin);
  lin = gensub(/«4F»«4F»/, "«4C»", "g", lin);
  lin = gensub(/«50»«50»/, "«4D»", "g", lin);

  # Replacement of other weak+weak by weak+alif+weak.
  # (This is even more doubtful than the above...).
  lin = gensub(/«4E»«4F»/, "«4E»«27»«4F»", "g", lin);
  lin = gensub(/«4E»«50»/, "«4E»«27»«50»", "g", lin);
  lin = gensub(/«4F»«4E»/, "«4F»«27»«4E»", "g", lin);
  lin = gensub(/«4F»«50»/, "«4F»«27»«50»", "g", lin);
  lin = gensub(/«50»«4E»/, "«50»«27»«4E»", "g", lin);
  lin = gensub(/«50»«4F»/, "«50»«27»«4F»", "g", lin);

  # Conversion of slashes between hexbytes to "«//»".
  # lin = gensub(/([.]|[»])[\/]([«])/, "\\1«//»\\2", "g", lin);

  # Conversion of spaces between hexbytes to "«__»".
  # lin = gensub(/([.]|[»])[ _]+([«])/, "\\1«__»\\2", "g", lin);

  # Conversion of paragraph marks (each becomes a blank line):
  gsub(/<[pP]>/, "\n\n", lin);
  gsub(/<\/[pP]>/, "\n\n", lin);

  # Check for leftover HAR letters and flag them visibly:
  lin = gensub(/[×]([a-zA-Z])/, "«**»{\\1}", "g", lin);
  # Remove "×" marks before other chars:
  gsub(/[×]/, "", lin);

  print lin;
}
function data_error(msg)
{
  # Reports a fatal problem with the current input record and terminates.
  #   msg: text of the complaint.
  # Sets the global "abort" to 1, writes "FILENAME:FNR: msg" to the
  # standard error stream, and exits with status "abort".
  abort = 1;
  printf("%s:%s: %s\n", FILENAME, FNR, msg) > "/dev/stderr";
  exit(abort);
}