#! /usr/bin/gawk -f
# Last edited on 2004-01-27 05:50:42 by stolfi

// {
  # Rewrites each input line, taken as iso-8859-6 (Arabic) text, into a
  # notation where every Arabic character becomes "«XX»".  XX is two hex
  # digits equal to the byte value minus 0xA0; adding 0x0600 to XX gives
  # the Unicode code point (e.g. '\301' -> "«21»" -> U+0621).
  #   * bytes that are not assigned in iso-8859-6 become "«**»{<byte>}";
  #   * NO-BREAK SPACE becomes a plain space;
  #   * a run of spaces between two Arabic codes becomes "«__»".
  # Bytes below '\200', and '\244' and '\255', are passed through as is.
  #
  # NOTE(review): the rules below rely on "«" and "»" being the single
  # bytes '\253' and '\273' (their iso-8859-1 codes), i.e. on this script
  # being stored in a single-byte encoding and run byte-wise -- confirm.
  lin = $0;

  # Map any characters beyond '\200' that are not in iso-8859-6 to
  # an invalid code.  But watch out:
  #   iso-8859-1 "»" = '\273' = iso-8859-6 ARABIC SEMICOLON
  #   iso-8859-1 "«" = '\253' = iso-8859-6 invalid char
  # This pass must run before any "«" is emitted, because its third
  # range includes '\253'.  '\201' is only a temporary marker; it is
  # turned into "«**»" near the end.

  lin = gensub(/[\200-\237]/, "\201{&}", "g", lin);
  lin = gensub(/[\241-\243]/, "\201{&}", "g", lin);
  lin = gensub(/[\245-\253]/, "\201{&}", "g", lin);
  lin = gensub(/[\256-\272]/, "\201{&}", "g", lin);
  lin = gensub(/[\274-\276]/, "\201{&}", "g", lin);
  lin = gensub(/[\300]/,      "\201{&}", "g", lin);
  lin = gensub(/[\333-\337]/, "\201{&}", "g", lin);
  lin = gensub(/[\363-\377]/, "\201{&}", "g", lin);

  # This character has an irregular mapping:
  gsub(/[\240]/,   " ", lin); # NO-BREAK SPACE

  # ARABIC SEMICOLON has the same byte value as the closing "»" delimiter,
  # so it must be translated before any other rule emits a "»".  Doing it
  # later would rewrite the delimiters of codes already produced (e.g. an
  # ARABIC COMMA would come out as "«0C«1B»").  The "»" that this rule
  # itself emits is safe: gsub does not rescan its replacement text.
  gsub(/[\273]/, "«1B»", lin); # ARABIC SEMICOLON

  # The following characters map trivially to Unicode (just add u0600)
  gsub(/[\254]/, "«0C»", lin); # ARABIC COMMA

  gsub(/[\277]/, "«1F»", lin); # ARABIC QUESTION MARK

  gsub(/[\301]/, "«21»", lin); # ARABIC LETTER HAMZA
  gsub(/[\302]/, "«22»", lin);
  gsub(/[\303]/, "«23»", lin);
  gsub(/[\304]/, "«24»", lin);
  gsub(/[\305]/, "«25»", lin);
  gsub(/[\306]/, "«26»", lin);
  gsub(/[\307]/, "«27»", lin);
  gsub(/[\310]/, "«28»", lin);
  gsub(/[\311]/, "«29»", lin);
  gsub(/[\312]/, "«2A»", lin);
  gsub(/[\313]/, "«2B»", lin);
  gsub(/[\314]/, "«2C»", lin);
  gsub(/[\315]/, "«2D»", lin);
  gsub(/[\316]/, "«2E»", lin);
  gsub(/[\317]/, "«2F»", lin);

  gsub(/[\320]/, "«30»", lin);
  gsub(/[\321]/, "«31»", lin);
  gsub(/[\322]/, "«32»", lin);
  gsub(/[\323]/, "«33»", lin);
  gsub(/[\324]/, "«34»", lin);
  gsub(/[\325]/, "«35»", lin);
  gsub(/[\326]/, "«36»", lin);
  gsub(/[\327]/, "«37»", lin);
  gsub(/[\330]/, "«38»", lin);
  gsub(/[\331]/, "«39»", lin);
  gsub(/[\332]/, "«3A»", lin); # ARABIC LETTER GHAIN

  gsub(/[\340]/, "«40»", lin); # ARABIC TATWEEL

  gsub(/[\341]/, "«41»", lin); # ARABIC LETTER FEH
  gsub(/[\342]/, "«42»", lin);
  gsub(/[\343]/, "«43»", lin);
  gsub(/[\344]/, "«44»", lin);
  gsub(/[\345]/, "«45»", lin);
  gsub(/[\346]/, "«46»", lin);
  gsub(/[\347]/, "«47»", lin);
  gsub(/[\350]/, "«48»", lin);
  gsub(/[\351]/, "«49»", lin);
  gsub(/[\352]/, "«4A»", lin);
  gsub(/[\353]/, "«4B»", lin);
  gsub(/[\354]/, "«4C»", lin);
  gsub(/[\355]/, "«4D»", lin);
  gsub(/[\356]/, "«4E»", lin);
  gsub(/[\357]/, "«4F»", lin); # ARABIC DAMMA

  gsub(/[\360]/, "«50»", lin); # ARABIC KASRA
  gsub(/[\361]/, "«51»", lin);
  gsub(/[\362]/, "«52»", lin); # ARABIC SUKUN

  # Now take care of invalid chars: turn the temporary '\201' marker
  # into the visible "«**»" code.  The "{" is written as a bracket
  # expression so that it can never be parsed as the start of an
  # interval expression.
  gsub(/[\201][{]/, "«**»{", lin);

  # Spaces between Arabic words:
  gsub(/[»][ ]+[«]/, "»«__»«", lin);
  print lin;
  next;
}