#! /usr/bin/gawk -f
# Last edited on 2004-02-28 02:36:31 by stolfi
# Preprocess the New Testament in Vietnamese (VIQR)
BEGIN {
  # This script fixes the encoding of accents in Vietnamese VIQR text.

  # Usage line printed by arg_error().
  usage = ARGV[0] " < INPUT.src > OUTPUT.src";

  # Negative while no error has occurred; the error helpers set it to
  # the exit status, which the abort rule and END then honour.
  abort = -1;
}
# Stop processing records as soon as an error status has been recorded.
abort >= 0 { exit abort }
# "# File ....txt" header lines.
# A header of the form "# File 20N-REST.txt" is rewritten into a
# level-1 "@section" line (tag "bN") followed by the normalized header
# comment; any other "# File ....txt" line is printed unchanged.
/^[\#] +File +.*[.]txt/ {
  # The third argument of gensub must be "g"/"G" or a number; the number 1
  # replaces only the first match without triggering a gawk warning.
  $0 = gensub( \
    /^[\#] +File +20([0-9])-(.*)[.]txt/, \
    "@section 1 {b\\1}\n\n# File 20\\1-\\2.txt\n\n", 1, $0 \
  );
  print;
  next;
}
# Lines that start with a three-capital-letter code and a space
# (presumably the book abbreviation), normally followed by CHAPTER:VERSE.
# The label is rewritten as "{BBB:C:V}", preceded by a level-3 "@section"
# line for the verse and, when the verse number is 1, also by a level-2
# "@section" line for the chapter.
/^ *[A-Z][A-Z][A-Z][ ]/ {
  # Verse 1: open a new chapter section as well as the verse section.
  # "\\4" restores the character that follows the verse number, so that
  # the text after the label is not glued to the closing brace.
  $0 = gensub( \
    /^ *([A-Z][A-Z][A-Z])[ ]([0-9]+)[:](1)([^0-9]|$)/, \
    "@section 2 {c\\2}\n\n@section 3 {v\\3}\n\n {\\1:\\2:\\3}\\4", 1, $0 \
  );
  # Any other verse.  After the substitution above the line starts with
  # "@section", so this pattern cannot match a second time.
  $0 = gensub( \
    /^ *([A-Z][A-Z][A-Z])[ ]([0-9]+)[:]([0-9]+)([^0-9]|$)/, \
    "@section 3 {v\\3}\n\n {\\1:\\2:\\3}\\4", 1, $0 \
  );
  print;
  next;
}
# Blank lines and lines whose first non-blank character is "#" or "@"
# are copied to the output untouched.
/^[ \011]*([\#@]|$)/ { print $0; next }
# Every remaining non-empty line is ordinary text.
/./ {
  # Remap accent codes and isolate punctuation.
  $0 = remap_accents($0);

  # Whitespace cleanup: drop trailing blanks, squeeze every run of
  # blanks/tabs to one space, then normalize the leading blanks.
  sub(/[ \011]+$/, "");
  gsub(/[ \011]+/, " ");
  sub(/^[ \011]+/, " ");

  # Make sure the line starts with the leading blank.
  sub(/^[ ]*/, " ");

  print $0;
  next;
}
END {
  # Propagate the status recorded by an error helper, if any.
  if (abort >= 0)
    exit abort;
}
function remap_accents(w)
{
#
# Returns a copy of the string "w" with some accent codes remapped,
# quote marks turned into guillemets, and punctuation set off by blanks.
# The substitutions below are applied in sequence and depend on their order.
#
# Remap accent codes to avoid confusion with punctuation:
#
# dot-below "." -> "°"
# breve "(" -> "µ"
# hook "?" -> "ß"
#
# The breve is remapped first (only after "A"/"a"), so that the next two
# patterns can accept "µ" as an optional modifier between the vowel and
# the "." or "?" code.
w = gensub(/([Aa])[\(]/, "\\1µ", "g", w);
w = gensub(/([Aa][µ\^]?|[Oo][\+\^]?|[Uu][\+]?|[Ee][\^]?|[IiYy])[.]/, "\\1°", "g", w);
w = gensub(/([Aa][µ\^]?|[Oo][\+\^]?|[Uu][\+]?|[Ee][\^]?|[IiYy])[?]/, "\\1ß", "g", w);
# Remove "\"-protection from punctuation: a backslash followed by "." or "?"
# becomes that character surrounded by blanks.
w = gensub(/[\\]([.?])/, " \\1 ", "g", w);
# Space out brackets and parentheses
w = gensub(/([][()])/, " \\1 ", "g", w);
# Replace some common quote patterns: a double quote becomes "«" or "»"
# depending on the neighbouring blanks and punctuation.
w = gensub(/[:][ ]*[\"]/, ": « ", "g", w);
w = gensub(/[\"] *([,;.?!)])/, " » \\1", "g", w);
w = gensub(/([.!?]) *[\"] *([-]|$)/, "\\1 » \\2", "g", w);
w = gensub(/^[ ]*[\"]/, "« ", "g", w);
w = gensub(/[ ][\"]([^ ])/, " « \\1", "g", w);
w = gensub(/([^ ])[\"][ ]/, "\\1 » ", "g", w);
# Isolate all punctuation:
w = gensub(/([,.:;?!])/, " \\1 ", "g", w);
# Except colons in verse numbers: a blank-preceded ":" followed by digits
# is joined back to those digits and to the preceding text.
w = gensub(/ +[:] *([0-9]+)/, ":\\1", "g", w);
return w;
}
function arg_error(msg)
{
  # Reports "msg" and the usage line on standard error, records the
  # failure in the global "abort", and terminates with status 1.
  printf "%s\n", msg > "/dev/stderr";
  printf "** usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit abort;
}
function data_error(msg)
{
  # Reports "msg" on standard error, prefixed with the current input
  # line number (FNR), records the failure in the global "abort", and
  # terminates with status 1.
  printf "line %d: ** %s\n", FNR, msg > "/dev/stderr";
  abort = 1;
  exit abort;
}
function load_lowercase_table(file,   nMap,lin,fld,nfld,nlin,status)
{
  # Reads a word mapping table from "file", containing pairs
  # of the form ORIGINAL NEW, one pair per line.  Lines that start
  # with "#" are ignored.
  # Stores the table in the global array "wmap[ORIGINAL] = NEW";
  # any previous contents of "wmap" are discarded.
  # Terminates with status 1 if the file cannot be read, contains no
  # pairs, has a line without exactly two fields, or repeats a key.
  # The names after "file" are local variables, not arguments.
  nMap = 0;
  nlin = 0;
  split("", wmap);
  while ((status = (getline lin < file)) > 0) {
    nlin++;
    if (lin ~ /^[#]/) { continue; }
    nfld = split(lin, fld, " ");
    if (nfld != 2) { table_error(file, nlin, ("bad table entry = \"" lin "\"")); }
    if (fld[1] in wmap) { table_error(file, nlin, ("repeated key = \"" lin "\"")); }
    wmap[fld[1]] = fld[2];
    nMap++;
  }
  # getline returns -1 on an open or read failure and 0 at end of file;
  # ERRNO is only meaningful in the former case, so test the status
  # rather than the contents of ERRNO.
  if (status < 0) { arg_error((file ": " ERRNO)); }
  close(file);
  if (nMap == 0) { arg_error(("file \"" file "\" empty or missing")); }
  # printf "** loaded %6d map pairs\n", nMap > "/dev/stderr"
}

function table_error(file, nlin, msg)
{
  # Reports a malformed entry at line "nlin" of table "file" on standard
  # error, records the failure in the global "abort", and terminates
  # with status 1.
  printf "%s:%d: ** %s\n", file, nlin, msg > "/dev/stderr";
  abort = 1;
  exit 1;
}