#! /bin/gawk -f
# Last edited on 2004-01-29 15:59:46 by stolfi

# Extracts words from the raw (HTML) Quran, for checking purposes.
# Converts them to JSAR.

# Ignore records that are empty, contain only spaces, or whose first
# non-space character is '#'.
/^ *([#]|$)/ {
  next;
}

# Save the current record in the global `lin`; the plain-text rule further
# down edits and prints that copy.
/./ {
  # Sukun removal is currently disabled.  If re-enabled it acts on the
  # record itself, so both `lin` and the fields see the change:
  # gsub(/[°]/, "");

  lin = $0;
}

# On '@' directive lines, turn every '{' and '}' into a space.  Editing the
# record makes awk re-split it, so the directive rules below can pick the
# arguments up as $2, $3, $4.
/^ *[@]/ {
  gsub(/[{}]/, " ");
}
# "@chapter" directive: print field 2 followed by "." on its own line, then
# field 4 with every '_' replaced by a newline and every '/' placed on a
# line of its own.  No newline is written after the last piece of field 4.
/^ *[@]chapter/ {
  n = $2;
  # The two substitutions are independent of each other: neither
  # replacement text contains the character the other one matches.
  t = gensub(/[\/]/, "\n/\n", "g", gensub(/[_]/, "\n", "g", $4));
  printf "%s.\n", n;
  printf "%s", t;
  next;
}
# Remaining '@' directives ("@chapter" was consumed by the rule above):
#   @verse -> prints "<field 2>.<field 3>." on one line;
#   @=     -> prints an empty line break, then "÷" and "=" on their own lines;
#   others -> produce no output.
# In every case the record is finished here and never reaches the
# plain-text rule.
/^ *[@]/ {
  if ($0 ~ /^ *[@]verse/) {
    printf "%s.%s.\n", $2, $3;
  } else if ($0 ~ /^ *[@][=]/) {
    printf "\n÷\n=\n";
  }
  next;
}

# Plain-text records: split the saved copy `lin` into one token per output
# line and print it.
# NOTE(review): nothing is appended at the end of the record, so a record
# whose last character is not a space, '_', '.', '/' or '=' runs straight
# into whatever is printed next -- confirm this matches the input format.
/./ {
  # Drop brace-enclosed (non-nested) spans, which are treated as crud.
  gsub(/[{][^{}]*[}]/, "", lin);

  # Surround each '.' and '/' with spaces so it becomes a separate token.
  gsub(/[.\/]/, " & ", lin);

  # Strip leading spaces and collapse every run of spaces to one space.
  sub(/^[ ]+/, "", lin);
  gsub(/[ ]+/, " ", lin);

  # End of paragraph: '=' (with any '_' directly before it) becomes the
  # tokens "÷" and "=", each delimited by '_'.
  gsub(/[_]*[=]/, "_÷_=_", lin);

  # Spaces and underscores are both token separators: one newline each.
  gsub(/[ _]/, "\n", lin);

  printf "%s", lin;
}