#! /bin/gawk -f
# Last edited on 2004-01-30 06:39:06 by stolfi

# Tries to convert from the codes used in the
# HTML Transliterated Quran (HAR encoding)
# to hexbytes (presumably the low-order byte of the Unicode code point,
# written in hexadecimal between "«" and "»").
# These rules are mostly guesses, and almost surely wrong.
#
# NOTE(review): this file had been collapsed onto a few physical lines and
# literal HTML tags inside strings and comments ("<Z>", "</Z>", "</", "<p>")
# had been stripped.  They are restored here from context (the later regexps
# expect <Z> markup and a closing tag after every code) -- please confirm
# against a pristine copy of the script.

# Lines starting with "#" or "@" (after optional blanks) are copied verbatim:
/^[ ]*[\#@]/ { print; next; }

# Every other line is converted:
// {
  lin = $0;

  # Map tabs and non-breaking spaces to plain space:
  gsub(/[\011\240]/, " ", lin);

  # Delete boldface (silent/spoken) markup.  First turn the tail of
  # <strong>/</strong> into "B>" (leaving "<B>"/"</B>"), then drop all <b> tags:
  gsub(/[Ss][Tt][Rr][Oo][Nn][Gg]>/, "B>", lin);
  gsub(/<[\/]*[bB]>/, "", lin);

  # Replaces <u><i>...</i></u> (and its equivalent inversion <i><u>...</u></i>)
  # by <Z>...</Z>:
  gsub(/<[uU]><[iI]>/, "<Z>", lin);
  gsub(/<\/[iI]><\/[uU]>/, "</Z>", lin);
  gsub(/<[iI]><[uU]>/, "<Z>", lin);
  gsub(/<\/[uU]><\/[iI]>/, "</Z>", lin);

  # Normalize <u> and <i> to upper case:
  gsub(/[uU]>/, "U>", lin);
  gsub(/[iI]>/, "I>", lin);

  # Decode character by character.  Note that consecutive characters
  # with the same markup (<U>, <I>, or <Z>) may have been fused.
  # This loop breaks such groups into individual characters, peeling one
  # code off the front of each group per pass: "<m>XY" becomes "<m>X</m><m>Y".
  # The two-letter code "th" is tried first, then a lone "t" (not followed
  # by "h"), then the remaining single letters.  The loop ends when none of
  # the three substitutions changes the line.
  change = 1;
  while (change) {
    tmp = gensub(/<([UIZ])>([tT][hH])([adhstnlADHSTNL ])/, "<\\1>\\2</\\1><\\1>\\3", "g", lin);
    change = (tmp != lin);
    lin = tmp;
    if (! change) {
      tmp = gensub(/<([UIZ])>([tT])([adstnlADSTNL ])/, "<\\1>\\2</\\1><\\1>\\3", "g", lin);
      change = (tmp != lin);
      lin = tmp;
    }
    if (! change) {
      tmp = gensub(/<([UIZ])>([adhsnlADHSNL ])([adhstnlADHSTNL ])/, "<\\1>\\2</\\1><\\1>\\3", "g", lin);
      change = (tmp != lin);
      lin = tmp;
    }
  }

  # A marked-up blank is just a blank:
  gsub(/<[UIZ]> <\/[UIZ]>/, " ", lin);

  # Protect "/" in closing HTML tags (the plain "/" is itself a HAR code):
  gsub(/<[\/]/, "<@", lin);

  # ------------------------------------------------------------
  # Compress each HAR code to "«mZ»" where "m" is the markup [UIZN]
  # and "Z" is the original 1- or 2-letter code:

  # Put a "×" in front of each unconverted character:
  lin = gensub(/(.)/, "×\\1", "g", lin);

  # Undo it for spaces, digits, punctuation:
  lin = gensub(/[×]([- 0-9,._])/, "\\1", "g", lin);

  # Undo it for HTML markup tags (single-letter opening and closing tags):
  lin = gensub(/[×][<][×]([a-zA-Z])[×][>]/, "<\\1>", "g", lin);
  lin = gensub(/[×][<][×][@][×]([a-zA-Z])[×][>]/, "<@\\1>", "g", lin);

  # Compress two-letter codes with HTML markup:
  lin = gensub(/[<]([UIZ])[>][×]([Tt])[×]([h])[<][@][UIZ][>]/, "«\\1\\2\\3»", "g", lin);

  # Compress single-letter codes with HTML markup:
  lin = gensub(/[<]([UIZ])[>][×]([AaDdHhSsTtNnLl])[<][@][UIZ][>]/, "«\\1\\2»", "g", lin);

  # Compress double-letter codes, plain (fake an "N" markup):
  lin = gensub(/[×]A[×]A/, "«NAA»", "g", lin);
  lin = gensub(/[×]([Ee])[×]e/, "«N\\1e»", "g", lin);
  lin = gensub(/[×]([Oo])[×]o/, "«N\\1o»", "g", lin);
  lin = gensub(/[×]([gktsGKTS])[×][h]/, "«N\\1h»", "g", lin);

  # Compress single-letter codes, plain (fake an "N" markup):
  lin = gensub(/[×]([\/abdfhijklmnoqrstuwyzABDFHIJKLMNOQRSTUWYZ])/, "«N\\1»", "g", lin);

  # ------------------------------------------------------------
  # Now map compressed HAR codes "«mZ»" to Unicode hexbyte "«xx»".
  # Codes whose first letter is uppercase get a temporary "^" marker
  # in front; codes believed to be typos become "«**»{..}".

  # Uppercase:
  gsub(/[«]UL[»]/,  "^«**»{ul}", lin); # Possible typo?
  gsub(/[«]UN[»]/,  "^«**»{un}", lin); # Possible typo?

  gsub(/[«]UA[»]/,  "^«4E»", lin);
  gsub(/[«]NB[»]/,  "^«28»", lin);
  gsub(/[«]NT[»]/,  "^«2A»", lin);
  gsub(/[«]NTh[»]/, "^«2B»", lin);
  gsub(/[«]NJ[»]/,  "^«2C»", lin);
  gsub(/[«]UH[»]/,  "^«2D»", lin);
  gsub(/[«]NKh[»]/, "^«2E»", lin);
  gsub(/[«]ND[»]/,  "^«2F»", lin);
  gsub(/[«]UTh[»]/, "^«30»", lin);
  gsub(/[«]NR[»]/,  "^«31»", lin);
  gsub(/[«]NZ[»]/,  "^«32»", lin);
  gsub(/[«]NS[»]/,  "^«33»", lin);
  gsub(/[«]NSh[»]/, "^«34»", lin);
  gsub(/[«]US[»]/,  "^«35»", lin);
  gsub(/[«]UD[»]/,  "^«36»", lin);
  gsub(/[«]UT[»]/,  "^«37»", lin);
  gsub(/[«]ZTh[»]/, "^«38»", lin);
  gsub(/[«]NGh[»]/, "^«3A»", lin);
  gsub(/[«]NF[»]/,  "^«41»", lin);
  gsub(/[«]NQ[»]/,  "^«42»", lin);
  gsub(/[«]NK[»]/,  "^«43»", lin);
  gsub(/[«]NL[»]/,  "^«44»", lin);
  gsub(/[«]NM[»]/,  "^«45»", lin);
  gsub(/[«]NN[»]/,  "^«46»", lin);
  gsub(/[«]NH[»]/,  "^«47»", lin);
  gsub(/[«]NOo[»]/, "^«48»", lin);
  gsub(/[«]NW[»]/,  "^«48»", lin);
  gsub(/[«]NEe[»]/, "^«4A»", lin);
  gsub(/[«]NY[»]/,  "^«4A»", lin);
  gsub(/[«]NA[»]/,  "^«4E»", lin);
  gsub(/[«]NO[»]/,  "^«4E»", lin);
  gsub(/[«]NU[»]/,  "^«4F»", lin);
  gsub(/[«]NI[»]/,  "^«50»", lin);

  # Lowercase:
  gsub(/[«]Ul[»]/,  "«**»{ul}", lin); # Possible typo?
  gsub(/[«]Un[»]/,  "«**»{un}", lin); # Possible typo?

  gsub(/[«]N[\/][»]/, "«21»", lin);
  gsub(/[«]Ua[»]/,  "«4E»", lin);
  gsub(/[«]Nb[»]/,  "«28»", lin);
  gsub(/[«]Nt[»]/,  "«2A»", lin);
  gsub(/[«]Nth[»]/, "«2B»", lin);
  gsub(/[«]Nj[»]/,  "«2C»", lin);
  gsub(/[«]Uh[»]/,  "«2D»", lin);
  gsub(/[«]Nkh[»]/, "«2E»", lin);
  gsub(/[«]Nd[»]/,  "«2F»", lin);
  gsub(/[«]Uth[»]/, "«30»", lin);
  gsub(/[«]Nr[»]/,  "«31»", lin);
  gsub(/[«]Nz[»]/,  "«32»", lin);
  gsub(/[«]Ns[»]/,  "«33»", lin);
  gsub(/[«]Nsh[»]/, "«34»", lin);
  gsub(/[«]Us[»]/,  "«35»", lin);
  gsub(/[«]Ud[»]/,  "«36»", lin);
  gsub(/[«]Ut[»]/,  "«37»", lin);
  gsub(/[«]Zth[»]/, "«38»", lin);
  gsub(/[«]NAA[»]/, "«39»", lin);
  gsub(/[«]Ngh[»]/, "«3A»", lin);
  gsub(/[«]Nf[»]/,  "«41»", lin);
  gsub(/[«]Nq[»]/,  "«42»", lin);
  gsub(/[«]Nk[»]/,  "«43»", lin);
  gsub(/[«]Nl[»]/,  "«44»", lin);
  gsub(/[«]Nm[»]/,  "«45»", lin);
  gsub(/[«]Nn[»]/,  "«46»", lin);
  gsub(/[«]Nh[»]/,  "«47»", lin);
  gsub(/[«]Nw[»]/,  "«48»", lin);
  gsub(/[«]Noo[»]/, "«48»", lin);
  gsub(/[«]Ny[»]/,  "«4A»", lin);
  gsub(/[«]Nee[»]/, "«4A»", lin);
  gsub(/[«]Na[»]/,  "«4E»", lin);
  gsub(/[«]No[»]/,  "«4E»", lin);
  gsub(/[«]Nu[»]/,  "«4F»", lin);
  gsub(/[«]Ni[»]/,  "«50»", lin);

  # OK, now get rid of the uppercase marker "^".
  # (Not worth the trouble to take it into account below.)
  gsub(/[\^][«]/, "«", lin);

  # Restore "/" in HTML closing tags:
  gsub(/<[@]/, "</", lin);

  # Turn paragraph tags <p> and </p> into blank lines:
  gsub(/<[pP]>/, "\n\n", lin);
  gsub(/<\/[pP]>/, "\n\n", lin);

  # Check for leftover HAR letters (flag each one as "«**»{x}"):
  lin = gensub(/[×]([a-zA-Z])/, "«**»{\\1}", "g", lin);

  # Remove "×" marks before other chars:
  gsub(/[×]/, "", lin);

  print lin;
}

# Reports {msg} on standard error, prefixed with the current input file
# name and record number, then terminates the script with exit status 1.
# (Not called by any rule visible in this file.)
function data_error(msg)
{
  printf "%s:%s: %s\n", FILENAME, FNR, msg > "/dev/stderr";
  abort = 1;
  exit abort;
}