#! /usr/bin/gawk -f
# Last edited on 2004-01-27 05:50:42 by stolfi
# Rewrites every input line, replacing ISO-8859-6 (Arabic) bytes by
# ASCII hex codes enclosed in marker delimiters, and prints the result.
#
# Output notation (L = byte '\253', R = byte '\273', i.e. the ISO-8859-1
# left and right guillemets):
#   L hh R     a valid ISO-8859-6 character; "hh" is the byte value
#              minus 0xA0 in hex (add 0x0600 to get the Unicode point).
#   L**R{b}    a byte "b" above '\177' that ISO-8859-6 does not define.
#   L__R       a run of blanks between two marked characters.
# Bytes '\244' (CURRENCY SIGN) and '\255' (SOFT HYPHEN) are valid in
# ISO-8859-6 and are passed through unchanged.
#
# NOTE(review): the octal byte patterns presumably require gawk to run
# in a single-byte locale (e.g. LC_ALL=C) -- confirm.

BEGIN {
  # The delimiters are spelled as octal escapes so that the script does
  # not depend on the encoding in which this file happens to be saved.
  MARK_L = "\253";
  MARK_R = "\273";
}

{
  lin = $0;

  # Tag every byte beyond '\177' that is not defined in ISO-8859-6 with
  # the temporary prefix '\201'.  But watch out:
  #   iso-8859-1 right guillemet = '\273' = iso-8859-6 ARABIC SEMICOLON
  #   iso-8859-1 left guillemet  = '\253' = iso-8859-6 invalid char
  # so this step must run before any marker is written into the line.
  lin = gensub(/[\200-\237]/, "\201{&}", "g", lin);
  lin = gensub(/[\241-\243]/, "\201{&}", "g", lin);
  lin = gensub(/[\245-\253]/, "\201{&}", "g", lin);
  lin = gensub(/[\256-\272]/, "\201{&}", "g", lin);
  lin = gensub(/[\274-\276]/, "\201{&}", "g", lin);
  lin = gensub(/[\300]/, "\201{&}", "g", lin);
  lin = gensub(/[\333-\337]/, "\201{&}", "g", lin);
  lin = gensub(/[\363-\377]/, "\201{&}", "g", lin);

  # ARABIC SEMICOLON must be translated before anything else: its byte
  # is the same as MARK_R, so doing it later would also rewrite the
  # closing delimiter of every marker already emitted.
  gsub(/[\273]/, MARK_L "1B" MARK_R, lin); # ARABIC SEMICOLON

  # NO-BREAK SPACE becomes a plain blank rather than a marker:
  gsub(/[\240]/, " ", lin); # NO-BREAK SPACE

  # The remaining characters all map to (byte - 0xA0) in hex:
  gsub(/[\254]/, MARK_L "0C" MARK_R, lin); # ARABIC COMMA
  gsub(/[\277]/, MARK_L "1F" MARK_R, lin); # ARABIC QUESTION MARK
  gsub(/[\301]/, MARK_L "21" MARK_R, lin); # ARABIC LETTER HAMZA
  gsub(/[\302]/, MARK_L "22" MARK_R, lin);
  gsub(/[\303]/, MARK_L "23" MARK_R, lin);
  gsub(/[\304]/, MARK_L "24" MARK_R, lin);
  gsub(/[\305]/, MARK_L "25" MARK_R, lin);
  gsub(/[\306]/, MARK_L "26" MARK_R, lin);
  gsub(/[\307]/, MARK_L "27" MARK_R, lin);
  gsub(/[\310]/, MARK_L "28" MARK_R, lin);
  gsub(/[\311]/, MARK_L "29" MARK_R, lin);
  gsub(/[\312]/, MARK_L "2A" MARK_R, lin);
  gsub(/[\313]/, MARK_L "2B" MARK_R, lin);
  gsub(/[\314]/, MARK_L "2C" MARK_R, lin);
  gsub(/[\315]/, MARK_L "2D" MARK_R, lin);
  gsub(/[\316]/, MARK_L "2E" MARK_R, lin);
  gsub(/[\317]/, MARK_L "2F" MARK_R, lin);
  gsub(/[\320]/, MARK_L "30" MARK_R, lin);
  gsub(/[\321]/, MARK_L "31" MARK_R, lin);
  gsub(/[\322]/, MARK_L "32" MARK_R, lin);
  gsub(/[\323]/, MARK_L "33" MARK_R, lin);
  gsub(/[\324]/, MARK_L "34" MARK_R, lin);
  gsub(/[\325]/, MARK_L "35" MARK_R, lin);
  gsub(/[\326]/, MARK_L "36" MARK_R, lin);
  gsub(/[\327]/, MARK_L "37" MARK_R, lin);
  gsub(/[\330]/, MARK_L "38" MARK_R, lin);
  gsub(/[\331]/, MARK_L "39" MARK_R, lin);
  gsub(/[\332]/, MARK_L "3A" MARK_R, lin); # ARABIC LETTER GHAIN
  gsub(/[\340]/, MARK_L "40" MARK_R, lin); # ARABIC TATWEEL
  gsub(/[\341]/, MARK_L "41" MARK_R, lin); # ARABIC LETTER FEH
  gsub(/[\342]/, MARK_L "42" MARK_R, lin);
  gsub(/[\343]/, MARK_L "43" MARK_R, lin);
  gsub(/[\344]/, MARK_L "44" MARK_R, lin);
  gsub(/[\345]/, MARK_L "45" MARK_R, lin);
  gsub(/[\346]/, MARK_L "46" MARK_R, lin);
  gsub(/[\347]/, MARK_L "47" MARK_R, lin);
  gsub(/[\350]/, MARK_L "48" MARK_R, lin);
  gsub(/[\351]/, MARK_L "49" MARK_R, lin);
  gsub(/[\352]/, MARK_L "4A" MARK_R, lin);
  gsub(/[\353]/, MARK_L "4B" MARK_R, lin);
  gsub(/[\354]/, MARK_L "4C" MARK_R, lin);
  gsub(/[\355]/, MARK_L "4D" MARK_R, lin);
  gsub(/[\356]/, MARK_L "4E" MARK_R, lin);
  gsub(/[\357]/, MARK_L "4F" MARK_R, lin); # ARABIC DAMMA
  gsub(/[\360]/, MARK_L "50" MARK_R, lin); # ARABIC KASRA
  gsub(/[\361]/, MARK_L "51" MARK_R, lin);
  gsub(/[\362]/, MARK_L "52" MARK_R, lin); # ARABIC SUKUN

  # Now take care of invalid chars: turn the temporary '\201' prefix
  # into the "**" marker.  The brace is bracketed so that it can never
  # be parsed as the start of an interval expression.
  gsub(/[\201][{]/, MARK_L "**" MARK_R "{", lin);

  # Spaces between Arabic words: blanks enclosed by the end of one
  # marker and the start of the next become the "__" marker.
  gsub(/[\273][ ]+[\253]/, MARK_R MARK_L "__" MARK_R MARK_L, lin);

  print lin;
  next;
}