#! /usr/bin/gawk -f
# Last edited on 2004-01-27 05:50:42 by stolfi

# Reads text encoded in ISO-8859-6 (Latin/Arabic) and writes each input
# record with every Arabic character replaced by a tag "«XX»", where XX
# is the two-digit uppercase hex value of (byte - 0xA0).  For these
# characters XX is also the low byte of the Unicode code point U+06XX.
#
# Bytes at or above '\200' that are not assigned in ISO-8859-6 are
# written as "«**»{c}", where c is the offending byte.  Runs of blanks
# between two tags are replaced by the tag "«__»".
#
# NOTE(review): the "«" and "»" literals below are assumed to be the
# single ISO-8859-1 bytes '\253' and '\273', and the regexps are assumed
# to match byte by byte (e.g. LC_ALL=C).  Confirm how this file is
# stored and run.

# Returns the "«XX»" tag for the ISO-8859-6 byte value {byte}.
function code_tag(byte) {
  return sprintf("«%02X»", byte - 160);
}

# Replaces every byte with value in {lo..hi} in {lin} by its "«XX»" tag
# and returns the result.  The pattern is built as a dynamic regexp
# such as "[\301]".
function recode_range(lin, lo, hi,    b) {
  for (b = lo; b <= hi; b++) {
    gsub(sprintf("[\\%03o]", b), code_tag(b), lin);
  }
  return lin;
}

# Returns {lin} with all ISO-8859-6 Arabic characters converted to tags.
function iso6_to_tags(lin) {
  # Mark every byte beyond '\200' that is not in ISO-8859-6 with the
  # sentinel '\201'.  The sentinel is itself in the invalid range, so
  # after this step every '\201' followed by "{" is one of ours.
  # But watch out:
  #   ISO-8859-1 "»" = '\273' = ISO-8859-6 ARABIC SEMICOLON
  #   ISO-8859-1 "«" = '\253' = ISO-8859-6 invalid char
  gsub(/[\200-\237\241-\243\245-\253\256-\272\274-\276\300\333-\337\363-\377]/, \
       "\201{&}", lin);

  # ARABIC SEMICOLON must be translated before any substitution that
  # emits "»": both are byte '\273', so doing it later would rewrite
  # the closing delimiter of tags already produced (an ARABIC COMMA
  # used to come out as "«0C«1B»").
  gsub(/[\273]/, "«1B»", lin);  # ARABIC SEMICOLON

  gsub(/[\240]/, " ", lin);     # NO-BREAK SPACE
  gsub(/[\254]/, "«0C»", lin);  # ARABIC COMMA
  gsub(/[\277]/, "«1F»", lin);  # ARABIC QUESTION MARK

  # The letters and marks map trivially to Unicode (just add U+0600
  # to the tag value):
  lin = recode_range(lin, 193, 218);  # '\301'..'\332': HAMZA .. GHAIN
  lin = recode_range(lin, 224, 242);  # '\340'..'\362': TATWEEL .. SUKUN

  # Now take care of invalid chars:
  gsub(/[\201][{]/, "«**»{", lin);

  # Spaces between Arabic words:
  gsub(/[»][ ]+[«]/, "»«__»«", lin);

  return lin;
}

{
  print iso6_to_tags($0);
  next;
}