#! /usr/bin/gawk -f
# Last edited on 2004-02-04 18:40:54 by stolfi

# Preprocess the Pentateuch in Vietnamese (VIQR)
#
# Reads an ".org" text from standard input and writes the cleaned-up
# text to standard output:
#
#   * Every word is mapped through the table "up-to-low.tbl", which
#     should map to lowercase all Vietnamese words, and capitalize
#     all proper names and transliterated hebrew words.
#
#   * VIQR accent codes that look like punctuation are remapped
#     (see "remap_accents").
#
#   * A paragraph-final "=" is replaced by a separate "@=" line
#     (which will be re-converted into "="-lines by org-to-evt).
#
# Warnings and errors are written to "/dev/stderr".

BEGIN {
  abort = -1;   # Exit status requested by an error routine; negative means "none".
  usage = ( \
    "cat main.org \\\n" \
    " | preprocess-org \\\n" \
    " | evt-from-org -v omitControls=1 -v omitBlanks=1 \\\n" \
    " > main.evt" \
  );

  nunitlines = 0;   # Number of lines already seen in current parag.
  lastwasblank = 0; # TRUE if last line written was a blank #-comment.
  lastword = "";    # Last word in current unit.

  load_lowercase_table("up-to-low.tbl");
}

# An error routine was called: stop processing input.
(abort >= 0) { exit abort; }

# "#"-comment lines: copied through; the 4th field of "ALPHA" lines
# gets the same accent remapping as the text.
/^[#]/ {
  if ($2 == "ALPHA") { $4 = remap_accents($4); }
  print;
  next;
}

# Blank lines are copied through unchanged.
/^[ \011]*$/ { print; next; }

# From here on, "@" and "#" are allowed only in column 1.
/.[@#]/ { data_error("@ or # not on column 1"); }

# Structural directives close the current unit.
/^[@](part|chapter|end)/ {
  end_current_unit();
  print;
  next;
}

# "@unit TYPE NUMBER NAME": closes the current unit and opens a new one.
/^[@]unit[ ][A-Z][ ][+]?[0-9]+[ ][-A-Za-z0-9:.]+[ ]*$/ {
  un_type = $2;
  un_num = $3;
  un_name = $4;
  end_current_unit();
  print;
  begin_new_unit();
  next;
}

# Directives that are copied through without affecting the unit state.
/^[@](format|include)/ { print; next; }

# Any other line containing "@" is an error.
/[@]/ { data_error("invalid @-directive"); }

# General contents line.
/./ {
  # General contents line cleanup
  gsub(/[ \011]+$/, "", $0);
  gsub(/[ \011]+/, " ", $0);
  gsub(/^[ \011]+/, " ", $0);

  fix_words();

  # A final "=" word marks the end of a paragraph; it is removed from
  # the line and emitted as a separate "@=" line below.
  lastword = $(NF);
  endparag = (lastword == "=");
  if (endparag) {
    NF--;
    # Warn when the paragraph does not end with sentence punctuation.
    if ((NF == 0) || ($(NF) !~ /^([.?!]|[.][.][.])$/)) {
      printf " «line %s : unexpected parag» ", FNR > "/dev/stderr";
    }
  }
  nunitlines++;
  print;
  if (endparag) { print "@="; }
  next;
}

END {
  if (abort >= 0) { exit abort; }
  if (nunitlines > 0) { end_current_unit(); }
}

function fix_words(   i,w)
{
  # Applies the case correction table "wmap" and the accent remapping
  # to every field of the current record.
  for (i = 1; i <= NF; i++) {
    w = $(i);
    if (w in wmap) { w = wmap[w]; }
    $(i) = remap_accents(w);
  }
}

function remap_accents(w)
{
  # Returns "w" with the accent codes remapped to avoid confusion
  # with punctuation:
  #
  #   dot-below "." -> "°"
  #   breve     "(" -> "µ"
  #   hook      "?" -> "ß"
  #
  # Words that consist solely of ".", "...", "(" or "?" are taken to be
  # punctuation and keep that character.
  if ((w != ".") && (w != "...")) { gsub(/[.]/, "°", w); }
  if (w != "(") { gsub(/[(]/, "µ", w); }
  if (w != "?") { gsub(/[?]/, "ß", w); }
  return w;
}

function end_current_unit()
{
  # Finishes off the current unit: if it had any text lines, warns on
  # "/dev/stderr" when its last word is not one of the accepted closing
  # tokens; then resets the per-unit state. Writes nothing to stdout.
  if (nunitlines > 0) {
    if (lastword !~ /^([=:;,»]|[-][-])$/) {
      printf " «line %s : no punct» ", FNR > "/dev/stderr";
    }
    # output_comment_line("#");
  }
  nunitlines = 0;
  lastword = "";
}

function begin_new_unit()
{
  # Consistency check: the previous unit must have been closed.
  if (nunitlines != 0) { data_error("inconsistent nunitlines (2)"); }
}

function arg_error(msg)
{
  # Reports a setup error plus the usage message, and aborts with status 1.
  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit 1;
}

function data_error(msg)
{
  # Reports an error in the current input line, and aborts with status 1.
  printf "line %d: %s\n", FNR, msg > "/dev/stderr";
  abort = 1;
  exit 1;
}

function table_error(file, msg)
{
  # Reports a malformed entry in the word table "file", and aborts
  # with status 1.
  printf "%s: %s\n", file, msg > "/dev/stderr";
  abort = 1;
  exit 1;
}

function load_lowercase_table(file,   nMap,lin,fld,nfld,status)
{
  # Reads a word mapping table from "file", containing pairs
  # of the form ORIGINAL NEW, one per line. Lines that start with "#"
  # are ignored; any other line must have exactly two fields.
  # Stores the table in "wmap[ORIGINAL] = NEW".
  # Aborts if the file cannot be read, has a bad or repeated entry,
  # or yields no pairs at all.
  nMap = 0;
  split("", wmap);
  # The result of "getline" is kept so that a read failure (negative)
  # can be told apart from normal end of file (zero) without relying
  # on the initial value of ERRNO.
  while ((status = (getline lin < file)) > 0) {
    if (lin !~ /^[#]/) {
      nfld = split(lin, fld, " ");
      if (nfld != 2) { table_error(file, ("bad table entry = \"" lin "\"")); }
      if (fld[1] in wmap) { table_error(file, ("repeated key = \"" lin "\"")); }
      wmap[fld[1]] = fld[2];
      nMap++;
    }
  }
  if (status < 0) { arg_error((file ": " ERRNO)); }
  close(file);
  if (nMap == 0) { arg_error(("file \"" file "\" empty or missing")); }
  printf "loaded %6d map pairs\n", nMap > "/dev/stderr"
}