#! /usr/bin/gawk -f
# Last edited on 2004-10-14 02:21:07 by stolfi

# Reads the Omaha-Ponca corpus in J. Stolfi's current format (JS) from
# standard input. Writes to standard output a best approximation to
# J. E. Koontz's original format (JEK).
#
# Excluding comment lines, JS format has `title groups' and `text
# groups'.
#
# A title group starts with "@section N {tt}" and contains fields
# "<>tt", "<>st", "<>dt", "<>au".
#
# A text group starts with "@section 4 {v??}" and contains in
# order lines "<>rf", "<>pr", "<>op", "<>tr", "<>nt", "<>nk",
# "<>ns", "<>xr".
#
# In JEK format, the reference number comes last in the block; hence
# the locator of a text group is remembered in {loc} and only printed
# by {finish_group} when the group ends.

BEGIN {
  abort = -1;
  usage = ( ARGV[0] " [-v keepComments=BOOL] < main.src > main.jek" );

  # Comments and blank lines are dropped unless the user asks otherwise:
  if (keepComments == "") { keepComments = 0; }

  nttgroups = 0;   # Number of title groups seen
  ntxgroups = 0;   # Number of text groups seen
  loc = "{}";      # Last jod-locator seen.
  grouptype = "";  # Type of current group ("tx" or "tt" or "")
}

# Stop reading input as soon as an abort status has been set.
(abort >= 0) { exit(abort); }

/^ *([\#]|$)/ {
  # Comment/blank line - ignore unless user asked to keep
  if (keepComments) {
    lin = convert_en_line($0);
    print lin;
  }
  next;
}

/^[@]chars/ {
  # Charset declaration line - ignore
  next;
}

/^[@]section *[0-9] *{tt[0-9]*}/ {
  # Start of new title group.
  nttgroups++;
  if (grouptype != "") { finish_group(); }
  grouptype = "tt";
  loc = "{}";
  next;
}

/^[@]section *[0-9] *{v[0-9?]*}/ {
  # Start of new text group.
  ntxgroups++;
  if (grouptype != "") { finish_group(); }
  grouptype = "tx";
  # Reset state:
  loc = "{}";
  next;
}

/^[<][>]rf/ {
  # Locator line
  # Remove "<>rf" and adjacent spaces:
  loc = $0;
  gsub(/^[<][>]rf */, "", loc);
  gsub(/ +$/, "", loc);
  if (grouptype != "tx") {
    data_error(("locator outside of a text group \"" loc "\""));
  }
  # Check locator syntax:
  if ((loc !~ /^{jod[:]189[01]:[0-9]+[.][0-9]+}$/) && (loc !~ /^{sent[:][0-9]+}$/)) {
    data_error(("malformed locator \"" loc "\""));
  }
  next;
}

/^[<][>](tt|st|dt|au|pr|op|tr|nt|nk|ns|xr)/ {
  # Data line
  if (grouptype == "") { data_error(("data line not in any group")); }

  # Get input line tag. The tag is cut at the first blank, tab or "{",
  # so that "<>op{...}" (no space before the brace) is still recognized;
  # the brace-stripping pattern below accepts that form too.
  tag = data_line_tag($0);

  # Get input line, remove "<>", tag, braces, and adjacent spaces:
  lin = $0;
  gsub(/^[<][>][a-z][a-z] *{ */, "", lin);
  gsub(/ *} *$/, "", lin);

  # Process line and output it
  if (tag == "op") {
    # Omaha-Ponca line, perhaps with embedded non-OP text marked &...:
    lin = convert_op_line(lin);
    xtag = "op";
  } else {
    # English/Latin line, perhaps with embedded OP text marked @{...}:
    if (tag == "tr") {
      # Glossing line
      # Remove "%" from empty glosses
      gsub(/[%]/, "", lin);
    }
    lin = convert_en_line(lin);
    if (tag == "ns") {
      # JS note: becomes a generic note attributed to JS.
      lin = ( lin " [JS]");
      xtag = "nt";
    } else if (tag == "nk") {
      # JEK note: becomes a generic note attributed to JEK.
      lin = ( lin " [JEK]");
      xtag = "nt";
    } else if (tag == "tt") {
      xtag = "ti";
    } else {
      xtag = tag;
    }
  }
  printf "\\%s %s\n", xtag, lin;
  next;
}

/./ {
  # Unknown line tag:
  data_error(("unrecognized line"));
  next;
}

function data_line_tag(s)
{
  # Returns the tag of a data line {s} that begins with "<>": the
  # characters after "<>" up to (not including) the first blank, tab,
  # or "{". Returns "" if {s} does not begin with "<>".
  if (! match(s, /^[<][>][^ \t{]*/)) { return ""; }
  return substr(s, 3, RLENGTH - 2);
}

function finish_group(   xloc)
{
  # Closes the current group: for a text group, prints the pending
  # locator {loc} as an "\rf" line (reporting an error if no locator
  # was seen); for any group, prints a blank separator line.
  if (grouptype == "tt") {
    # Nothing to do
  } else if (grouptype == "tx") {
    # Print locator line:
    if (loc == "{}") { data_error(("missing locator")); }
    # Convert locator to JEK format:
    xloc = loc;
    gsub(/[{}]/, "", xloc);
    gsub(/jod:/, "jod ", xloc);
    gsub(/sent:/, "sent ", xloc);
    printf "\\rf %s\n", xloc;
  }
  printf "\n";
}

function convert_en_line(lin,   en,op,res)
{
  # Converts a non-OP (English/Latin) line {lin} that may contain
  # embedded OP text marked "@{...}". The non-OP text is copied
  # unchanged; each embedded OP chunk is re-encoded with
  # {convert_op_encoding} and wrapped in "@{...}" again.
  res = "";
  while (lin != "") {
    # Extract the leading chunk {en} of non-OP text ("" if none)
    # and the following OP text {op} ("" if none).
    if (match(lin, /[@][{][^{}]*[}]/)) {
      # Extract embedded OP text (RLENGTH counts "@{" and "}"):
      en = substr(lin, 1, RSTART-1);
      op = substr(lin, RSTART+2, RLENGTH-3);
      lin = substr(lin, RSTART+RLENGTH);
    } else {
      # Get trailing non-OP text:
      en = lin;
      op = "";
      lin = "";
    }
    # Convert accent encoding of OP text, restore its markup:
    if (op != "") { op = ( "@{" convert_op_encoding(op) "}" ); }
    res = ( res en op );
  }
  return res;
}

function convert_op_line(lin,   en,op,res)
{
  # Converts an OP line {lin} that may contain embedded non-OP words
  # marked "&...". The OP text is re-encoded with
  # {convert_op_encoding}; each embedded non-OP word is copied
  # unchanged, with its "&" marker restored.
  res = "";
  while (lin != "") {
    # Extract the leading chunk {op} OP text ("" if none)
    # and the following chunk {en} of non-OP text ("" if none).
    if (match(lin, /[&][A-ZÈa-zè]*/)) {
      # Extract embedded non-OP text:
      op = substr(lin, 1, RSTART-1);
      en = substr(lin, RSTART+1, RLENGTH-1);
      lin = substr(lin, RSTART+RLENGTH);
    } else {
      # Get trailing OP text:
      op = lin;
      en = "";
      lin = "";
    }
    # Convert accent encoding of OP text:
    op = convert_op_encoding(op);
    # Restore markup of non-OP text:
    if (en != "") { en = ( "&" en ); }
    res = ( res op en );
  }
  return res;
}

function convert_op_encoding(x)
{
  # Converts a pure OP text from the JS encoding to the JEK encoding.
  # The order of the substitutions matters: "ü" and "ý" are input
  # letters in the first steps and output markers in the last ones.
  # NOTE(review): the bracket expressions with accented letters
  # presumably require gawk to run in a locale that matches this
  # file's character encoding - confirm.

  # Expand [äëïöü] into unaccented vowel [aeiou] plus nasalization "~":
  gsub(/[ä]/, "a~", x);
  gsub(/[ë]/, "e~", x);
  gsub(/[ï]/, "i~", x);
  gsub(/[ö]/, "o~", x);
  gsub(/[ü]/, "u~", x);
  gsub(/[Ä]/, "A~", x);
  gsub(/[Ë]/, "E~", x);
  gsub(/[Ï]/, "I~", x);
  gsub(/[Ö]/, "O~", x);
  gsub(/[Ü]/, "U~", x);
  #
  # Expand [âêîôû] into accented vowel[áéíóú] plus nasalization:
  gsub(/[â]/, "á~", x);
  gsub(/[ê]/, "é~", x);
  gsub(/[î]/, "í~", x);
  gsub(/[ô]/, "ó~", x);
  gsub(/[û]/, "ú~", x);
  gsub(/[Â]/, "Á~", x);
  gsub(/[Ê]/, "É~", x);
  gsub(/[Î]/, "Í~", x);
  gsub(/[Ô]/, "Ó~", x);
  gsub(/[Û]/, "Ú~", x);
  #
  # Expand [áéíóú] into accent marker "ý" and vowel [aeiou]:
  gsub(/[á]/, "aý", x);
  gsub(/[é]/, "eý", x);
  gsub(/[í]/, "iý", x);
  gsub(/[ó]/, "oý", x);
  gsub(/[ú]/, "uý", x);
  gsub(/[Á]/, "Aý", x);
  gsub(/[É]/, "Eý", x);
  gsub(/[Í]/, "Iý", x);
  gsub(/[Ó]/, "Oý", x);
  gsub(/[Ú]/, "Uý", x);
  #
  # Replace "¿" or "¡" by "?":
  gsub(/[¿¡]/, "?", x);
  #
  # Change nasalization marker from "~" to "ü":
  gsub(/[~]/, "ü", x);
  #
  # JEK prefers the nasalization marker before the accent marker:
  gsub(/[ý][ü]/, "üý", x);
  return x;
}

END {
  if (abort >= 0) { exit(abort); }
  # Check for incomplete groups:
  if (grouptype != "") { finish_group(); }
  # Print stats
  printf "%7d title groups read\n", nttgroups > "/dev/stderr";
  printf "%7d text groups read\n", ntxgroups > "/dev/stderr";
}

function arg_error(msg)
{
  # Reports a command-line error {msg} plus the usage line on the
  # error output, and terminates with status 1.
  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit 1;
}

function data_error(msg)
{
  # Reports a problem {msg} with the current input line on the error
  # output, with file name, line number and last locator seen.
  # Processing continues afterwards.
  printf "%s:%d: %s ** %s\n", FILENAME, FNR, loc, msg > "/dev/stderr";
}