#! /usr/bin/gawk -f
# Last edited on 2001-12-31 14:04:43 by stolfi

# Process the Pentateuch in Vietnamese (VIQR)

BEGIN {
  abort = -1;
  usage = ( "viet-bib-process \\\n" \
    " < main.org > main.evt" \
  );

  # Lines will be numbered with <Xppp.U.nnk> where
  #   X   is a letter identifying the book (a,b,c...)
  #   ppp is the chapter number (sequential through book)
  #   U   is a letter identifying the unit type
  #   nn  is the versicle number within the unit
  #   k   is the line number in the versicle
  #
  # Reads file "up-to-low.tbl" with upper-to-lower mapping
  # of words that are not proper names.

  curbook = "";      # Current book code.
  curchapter = "";   # Current chapter number.
  curunit = "";      # Current unit code.
  curverse = "";     # Current verse number.

  nunitlines = 0;    # Number of lines already seen in current unit.
  nverselines = 0;   # Number of lines already seen in current verse.
  lastwasblank = 0;  # TRUE if last line written was a blank #-comment.
  lastword = "";     # Last word in current verse.

  load_lowercase_table("up-to-low.tbl");
}

# An error was flagged earlier: stop processing input.
(abort >= 0) { exit abort; }

# Comment line: copied to the output after optional field fix-ups.
/^ *[#]/ {
  if ($2 == "ALPHA") { $4 = remap_accents($4); }
  if ($2 == "BLANK") { $4 = "\"_\""; NF = 4; }
  lin = $0; gsub(/^ */,"",lin);
  output_comment_line(lin);
  next;
}

# Directives must start in column 1.
/.+[@]/ {
  data_error("@ not on column 1");
}

/./ {
  # General contents and control line cleanup
  gsub(/^[ ]+/, "", $0);
  gsub(/[ ]+$/, "", $0);
  gsub(/[ ]+/, " ", $0);
}

# "@verse BOOKTAG book chapter unit verse": closes whatever levels
# changed (book > chapter > unit > verse) and opens the new ones.
/^[@]verse[ ][A-Z][A-Z]+[ ][a-z][ ][0-9]+[ ][A-Z][ ][0-9]+[ ]*$/ {
  booktag = $2;
  book = $3;
  chapter = $4;
  unit = $5;
  verse = $6;
  if (book != curbook) {
    end_current_book();
  } else if (chapter != curchapter) {
    end_current_chapter();
  } else if (unit != curunit) {
    end_current_unit();
  } else {
    if (verse == curverse) { data_error(("repeated verse \"" verse "\"")); }
    end_current_verse();
  }
  # output_control_line($0);
  if (book != curbook) { begin_new_book(book); }
  if (chapter != curchapter) { begin_new_chapter(chapter); }
  if (unit != curunit) { begin_new_unit(unit); }
  begin_new_verse(verse);
  next;
}

/^[@]end[ ]*$/ {
  end_current_book();
  # output_control_line($0);
  next;
}

# Any other "@" line is an error (data_error() does not return).
/^[@]/ {
  data_error("invalid @ directive");
}

# Blank lines are ignored.
/^ *$/ {
  next;
}

/./ {
  # Contents line, phew!
  fix_words();
  output_contents_line(($0 " "));
  next;
}

END {
  if (abort >= 0) { exit abort; }
  end_current_book();
  insert_file("viqr-encoding.txt")
}

function fix_words(   i,w)
{
  # Apply case correction and accent remapping to every field of $0:
  for (i = 1; i <= NF; i++) {
    w = $(i);
    if (w in up_to_low) { w = up_to_low[w]; }
    $(i) = remap_accents(w);
  }
}

function remap_accents(w)
{
  # Returns `w' with accent codes remapped to avoid confusion
  # with punctuation:
  #
  #   dot-below "." -> "°"
  #   breve     "(" -> "µ"
  #   hook      "?" -> "ß"
  #
  # Words that consist only of the punctuation itself (".", "...",
  # "(", "?") are left untouched.

  if ((w != ".") && (w != "...")) { gsub(/[.]/, "°", w); }
  if (w != "(") { gsub(/[\(]/, "µ", w); }
  if (w != "?") { gsub(/[?]/, "ß", w); }
  return w;
}

function end_current_book()
{
  # Finishes off the current book, and sets curbook to "".
  if (curbook != "") {
    end_current_chapter();
    printf "end book %s\n", curbook > "/dev/stderr";
    curbook = "";
  } else {
    if (curchapter != "") { data_error("inconsistent curchapter (0)"); }
  }
}

function end_current_chapter()
{
  # Finishes off the current chapter, and sets curchapter to "".
  if (curchapter != "") {
    end_current_unit();
    curchapter = "";
    printf "]\n" > "/dev/stderr";
  } else {
    if (curunit != "") { data_error("inconsistent curunit"); }
  }
}

function end_current_unit()
{
  # Finishes off the current unit, and resets the unit line count.
  if (curunit != "") {
    end_current_verse();
    curunit = "";
    printf "%d)", nunitlines > "/dev/stderr";
    nunitlines = 0;
  } else {
    if (curverse != "") { data_error("inconsistent curverse"); }
    if (nunitlines != 0) { data_error("inconsistent nunitlines"); }
  }
}

function end_current_verse()
{
  # Finishes off the current verse, adding an "=" line if the verse
  # ended with sentence-final punctuation.  A warning goes to stderr
  # when the verse ends with no recognized punctuation at all.
  if (curverse != "") {
    if (nverselines > 0) {
      if (lastword ~ /^([.?!]|[.][.][.])$/) {
        output_contents_line("=");
      } else if (lastword !~ /^([:;,»]|[-][-])$/) {
        printf " «line %s : no punct» ", FNR > "/dev/stderr";
      }
      output_comment_line("#");
    }
    # Cleared only now, so that the "=" line above is still
    # numbered as part of the verse being closed.
    curverse = "";
    nverselines = 0;
    lastword = "";
  } else {
    if (nverselines != 0) { data_error("inconsistent nverselines"); }
  }
}

function begin_new_book(book)
{
  # Starts a new book with the given letter code.
  if (curbook != "") { data_error("inconsitent curbook (2)"); }
  curbook = book;
  printf "begin book %s\n", curbook > "/dev/stderr";
}

function begin_new_chapter(chapter)
{
  # Starts a new chapter and writes its "## <Xppp>" header comment.
  if (curbook == "") { data_error("unspecified book"); }
  if (curchapter != "") { data_error("inconsistent curchapter (2)"); }
  curchapter = chapter;
  output_comment_line(sprintf("## <%s%03d>", curbook, curchapter));
  printf "[%d=", curchapter > "/dev/stderr";
}

function begin_new_unit(unit)
{
  # Initializes a new unit with given letter code.
  if (curchapter == "") { data_error("unspecified chapter"); }
  if (curunit != "") { data_error("inconsistent curunit (2)"); }
  if (nunitlines != 0) { data_error("inconsistent nunitlines (2)"); }
  curunit = unit;
  printf "(%s:", curunit > "/dev/stderr";
  if (! lastwasblank) { output_comment_line("#"); }
}

function begin_new_verse(verse)
{
  # Starts a new verse.
  if (curunit == "") { data_error("unspecified unit"); }
  if (curverse != "") { data_error("inconsistent curverse (2)"); }
  if (nverselines != 0) { data_error("inconsistent nverselines (2)"); }
  curverse = verse;
}

function output_contents_line(lin,   loc,curline)
{
  # Writes the text line `lin' prefixed by its location code
  # "<Xppp.U.nnk>", and updates the line counters and `lastword'.
  nunitlines++;
  nverselines++;
  if (nverselines >= 10) {
    # Only one digit is available for the line number within a verse.
    data_error(("too many lines in verse"));
  } else {
    curline = 10*curverse + nverselines;
  }
  if (curbook == "") { data_error("unspecified book (2)"); }
  if (curchapter == "") { data_error("unspecified chapter (2)"); }
  if (curunit == "") { data_error("unspecified unit (2)"); }
  if (curverse == "") { data_error("unspecified verse (2)"); }
  # Replace blanks by underscores:
  gsub(/[ ]+/, "_", lin);
  # Write line:
  loc = sprintf("<%s%03d.%s.%03d>", curbook, curchapter, curunit, curline);
  printf "%-18s %s\n", loc, lin;
  lastwasblank = 0;
  # Remember the last word of the line (trailing "_" removed):
  lastword = lin;
  gsub(/[_][_]*$/, "", lastword);
  gsub(/^.*[_]/, "", lastword);
}

function output_comment_line(lin)
{
  # Outputs the comment line `lin', and sets `lastwasblank'
  gsub(/[ ]+$/, "", lin);
  if (lin !~ /^[#]/) { data_error("bad comment"); }
  lastwasblank = 0;
  if (lin ~ /^[#][ ]*$/) { lastwasblank = 1; }
  print lin;
}

function output_control_line(lin)
{
  # Outputs an "@" line
  print lin;
}

function arg_error(msg)
{
  # Reports a usage error on stderr and aborts with status 1.
  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit 1;
}

function data_error(msg)
{
  # Reports an input data error (with line number) and aborts.
  printf "line %d: %s\n", FNR, msg > "/dev/stderr";
  abort = 1;
  exit 1;
}

function error(msg)
{
  # Reports an auxiliary-file error on stderr and aborts with status 1.
  # Sets `abort' so that the END rule does not try to finish the output.
  printf "%s\n", msg > "/dev/stderr";
  abort = 1;
  exit 1;
}

function load_lowercase_table(file,   nMap,lin,fld,nfld,status)
{
  # Reads a word mapping table from "file", containing pairs
  # of the form ORIGINAL NEW.  Lines starting with "#" are skipped.
  # Stores the table in "up_to_low[ORIGINAL] = NEW".
  nMap=0;
  split("", up_to_low);
  while ((status = (getline lin < file)) > 0) {
    if (! match(lin, /^[#]/)) {
      nfld = split(lin, fld, " ");
      if (nfld != 2) { error(("bad table entry = \"" lin "\"")); }
      if (fld[1] in up_to_low) { error(("repeated key = \"" lin "\"")); }
      up_to_low[fld[1]] = fld[2];
      nMap++;
    }
  }
  # getline returns -1 on a read/open failure; ERRNO then describes it.
  if (status < 0) { error((file ": " ERRNO)); }
  close (file);
  if (nMap == 0) { arg_error(("file \"" file "\" empty or missing")); }
  printf "loaded %6d map pairs\n", nMap > "/dev/stderr"
}

function insert_file(file,   nRead,lin,status)
{
  # Copies the specified file into the output stream, as comments:
  # leading blanks/tabs are removed and "# " is prepended to every
  # line that does not already start with "#".
  nRead=0;
  while ((status = (getline lin < file)) > 0) {
    gsub(/^[ \011]+/, "", lin);
    if (lin !~ /^[#]/) { lin = ("# " lin); }
    print lin;
    nRead++;
  }
  # getline returns -1 on a read/open failure; ERRNO then describes it.
  if (status < 0) { error((file ": " ERRNO)); }
  close (file);
  if (nRead == 0) { arg_error(("file \"" file "\" empty or missing")); }
  printf "copied %6d lines from %s\n", nRead, file > "/dev/stderr"
}