#! /usr/bin/gawk -f
# Last edited on 2004-02-28 02:36:31 by stolfi

# Preprocess the New Testament in Vietnamese (VIQR).
#
# Reads the text from standard input and writes it to standard output:
#   - "# File 20N-XXX.txt" header lines become "@section 1" markers;
#   - "BBB C:V" reference lines become "@section 2"/"@section 3" markers;
#   - other comment ("#"), directive ("@") and blank lines are copied as-is;
#   - all remaining lines get their accent codes remapped, their
#     punctuation isolated, and their whitespace normalized.

BEGIN {
  abort = -1;
  usage = ( ARGV[0] " < INPUT.src > OUTPUT.src" );
  # Fixes the encoding of accents in Vietnamese VIQR text
}

# Stop processing input as soon as some function has flagged an error.
(abort >= 0) { exit abort; }

# File header line: open a level-1 section named after the file's digit.
/^[#] +File +.*[.]txt/ {
  # The third argument of gensub must be "g"/"G" or a number;
  # 1 means "replace the first match only".
  $0 = gensub(/^[#] +File +20([0-9])-(.*)[.]txt/,
              "@section 1 {b\\1}\n\n# File 20\\1-\\2.txt\n\n", 1, $0);
  print; next;
}

# Book/chapter/verse reference line ("BBB C:V").
/^ *[A-Z][A-Z][A-Z][ ]/ {
  # Verse 1 also opens a new chapter (level-2) section.  The character
  # captured after the verse number (group 4) is put back, as in the
  # general case below, so that it is not lost.
  $0 = gensub(/^ *([A-Z][A-Z][A-Z])[ ]([0-9]+)[:](1)([^0-9]|$)/,
              "@section 2 {c\\2}\n\n@section 3 {v\\3}\n\n {\\1:\\2:\\3}\\4", 1, $0);
  # Any other verse only opens a verse (level-3) section.  This pattern
  # can no longer match if the substitution above took place, because
  # the line then begins with "@".
  $0 = gensub(/^ *([A-Z][A-Z][A-Z])[ ]([0-9]+)[:]([0-9]+)([^0-9]|$)/,
              "@section 3 {v\\3}\n\n {\\1:\\2:\\3}\\4", 1, $0);
  print; next;
}

# Comments, "@" directives and blank lines are passed through unchanged.
/^[ \011]*([#@]|$)/ {
  print; next;
}

# Ordinary text line.
/./ {
  # Accent fixes
  $0 = remap_accents($0);
  # General contents line cleanup
  gsub(/[ \011]+$/, "", $0);
  gsub(/[ \011]+/, " ", $0);
  gsub(/^[ \011]+/, " ", $0);
  # insert leading spaces
  gsub(/^[ ]*/, " ", $0);
  print; next;
}

END {
  if (abort >= 0) { exit abort; }
}

function remap_accents(w)
{
  # Returns a copy of the string "w" with the VIQR accent codes that
  # look like punctuation remapped to other characters, with quotes
  # turned into guillemets, and with punctuation set off by spaces.
  #
  # Remap accent codes to avoid confusion with punctuation:
  #
  #   dot-below "." -> "°"
  #   breve     "(" -> "µ"
  #   hook      "?" -> "ß"
  #
  w = gensub(/([Aa])[\(]/, "\\1µ", "g", w);
  w = gensub(/([Aa][µ\^]?|[Oo][\+\^]?|[Uu][\+]?|[Ee][\^]?|[IiYy])[.]/, "\\1°", "g", w);
  w = gensub(/([Aa][µ\^]?|[Oo][\+\^]?|[Uu][\+]?|[Ee][\^]?|[IiYy])[?]/, "\\1ß", "g", w);

  # Remove "\"-protection from puncts
  w = gensub(/[\\]([.?])/, " \\1 ", "g", w);

  # Space out brackets and parentheses
  w = gensub(/([][()])/, " \\1 ", "g", w);

  # Replace some common quote patterns
  w = gensub(/[:][ ]*[\"]/, ": « ", "g", w);
  w = gensub(/[\"] *([,;.?!)])/, " » \\1", "g", w);
  w = gensub(/([.!?]) *[\"] *([-]|$)/, "\\1 » \\2", "g", w);
  w = gensub(/^[ ]*[\"]/, "« ", "g", w);
  w = gensub(/[ ][\"]([^ ])/, " « \\1", "g", w);
  w = gensub(/([^ ])[\"][ ]/, "\\1 » ", "g", w);

  # Isolate all punctuation:
  w = gensub(/([,.:;?!])/, " \\1 ", "g", w);

  # Except colons in verse numbers:
  w = gensub(/ +[:] *([0-9]+)/, ":\\1", "g", w);

  return w;
}

function arg_error(msg)
{
  # Reports "msg" and the usage string on stderr, then aborts with status 1.
  printf "%s\n", msg > "/dev/stderr";
  printf "** usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit 1;
}

function data_error(msg)
{
  # Reports "msg" on stderr, prefixed with the current input line
  # number, then aborts with status 1.
  printf "line %d: ** %s\n", FNR, msg > "/dev/stderr";
  abort = 1;
  exit 1;
}

function load_lowercase_table(file,   nMap,lin,fld,nfld,status)
{
  # Reads a word mapping table from "file", containing pairs
  # of the form ORIGINAL NEW.  Lines starting with "#" are ignored.
  # Stores the table in the global array "wmap[ORIGINAL] = NEW".
  # Aborts through "arg_error" on malformed or repeated entries,
  # on read errors, and when the table turns out to be empty.
  # The parameters after "file" are local variables.

  nMap = 0;
  split("", wmap);
  # Keep the status of the last "getline" so that a read failure (-1)
  # can be told apart from a normal end of file (0).
  while ((status = (getline lin < file)) > 0) {
    if (! match(lin, /^[#]/)) {
      nfld = split(lin, fld, " ");
      if (nfld != 2) arg_error(("bad table entry = \"" lin "\""));
      if (fld[1] in wmap) arg_error(("repeated key = \"" lin "\""));
      wmap[fld[1]] = fld[2];
      nMap++;
    }
  }
  # ERRNO describes the failure only when "getline" returned -1.
  if (status < 0) { arg_error((file ": " ERRNO)); }
  close (file);
  if (nMap == 0) { arg_error(("file \"" file "\" empty or missing")); }
  # printf "** loaded %6d map pairs\n", nMap > "/dev/stderr"
}