#! /usr/bin/gawk -f
# Last edited on 2004-10-22 02:11:10 by stolfi

# Reads the Omaha-Ponca corpus in temporary format.  Writes to "{synFile}" a
# version where the tokens in each OP line are printed side-by-side
# with their TR glosses.  Also writes to the file "{gprFile}" the list
# of all the OmahaPonca-English pairs from properly paired OP and TR lines.
#
# Define a "text group" as a set of three or more matched lines from the
# JOD corpus (in order, "<>rf", "<>pr", "<>op", "<>tr", possibly followed by
# "<>nt", "<>nk", "<>ns", "<>xr").  The input file should have a
# header "@section 4 {v??}" before each group.
#
# Both output files are optional: when "{synFile}" or "{gprFile}" is not
# given, the corresponding output is discarded.

BEGIN {
  abort = -1;
  usage = ( ARGV[0] " \\\n" \
    " [ -v synFile=NAME.syn ] \\\n" \
    " [ -v gprFile=NAME.gpr ] \\\n" \
    " < main.src" \
  );

  nttgroups = 0; # Number of title groups seen
  ntxgroups = 0; # Number of text groups seen
  noplines = 0;  # Number of Omaha-Ponca lines.
  ntrlines = 0;  # Number of gloss lines.
  noptoks = 0;   # Total number of Omaha-Ponca tokens read.
  ntrtoks = 0;   # Total number of glosses read.
  npairs = 0;    # Total glossing pairs written to {gprFile}.
  noplones = 0;  # Total unmatched OP tokens written to {gprFile}.
  ntrlones = 0;  # Total unmatched glosses written to {gprFile}.

  hadop = 0;     # True iff current group had an op line
  hadtr = 0;     # True iff current group had a tr line
  loc = "{}";    # Last jod-locator seen.

  # A redirection to an empty file name is a fatal error in gawk, so an
  # omitted output file is mapped to "/dev/null" instead.
  if (synFile != "") {
    printf "writing synchronized glosses to %s\n", synFile > "/dev/stderr";
  } else {
    synFile = "/dev/null";
  }
  if (gprFile != "") {
    printf "writing Op-En glossary to %s\n", gprFile > "/dev/stderr";
  } else {
    gprFile = "/dev/null";
  }
}

# Stop processing input as soon as an error handler has requested an abort.
(abort >= 0) { exit(abort); }

/^ *([\#]|$)/ {
  # Comment/blank line
  next;
}

/^[@]chars/ {
  # Charset declaration line - ignore
  next;
}

/^[@]section *[0-9] *{tt}/ {
  # Start of new title group.
  nttgroups++;
  start_group();
  next;
}

/^[@]section *[0-9] *{v[0-9?]*}/ {
  # Start of new text group.
  ntxgroups++;
  start_group();
  next;
}

/^[<][>]rf/ {
  # Locator line
  # Remove "<>rf" and adjacent spaces:
  loc = $0;
  gsub(/^[<][>]rf */, "", loc);
  gsub(/ +$/, "", loc);
  # Check locator syntax:
  if ((loc !~ /^{jod[:]189[01]:[0-9]+[.][0-9]+}$/) && (loc !~ /^{sent[:][0-9]+}$/)) {
    data_error(("malformed locator \"" loc "\""));
  }
  # Print group locator and line number to output files:
  printf "\\rf %s = line %d\n", loc, FNR > synFile;
  printf "# %s %d\n", loc, FNR > gprFile;
  next;
}

/^[<][>]op/ {
  # Omaha-ponca line
  noplines++;
  if (hadop) { data_error("multiple OP lines"); err = 1; next; }
  hadop = 1;
  if (hadtr) { data_error("TR line before OP line"); err = 1; next; }
  # Get whole line:
  lin = $0;
  # Remove "<>op {" and "}":
  gsub(/^[<][>]op *{/, "", lin);
  gsub(/} *$/, "", lin);
  # Make sure that "--"s have space after but not before:
  gsub(/[ ]*[-][-][ ]*/, "-- ", lin);
  # If the line contains any slashes, use them as token separators:
  if (lin ~ /[\/]/) {
    # Remove leading and trailing spaces of all slash-separated segments:
    gsub(/^[ ]+/, "", lin);
    gsub(/[ ]+$/, "", lin);
    gsub(/[ ]+[\/]/, "/", lin);
    gsub(/[\/][ ]+/, "/", lin);
    # Replace remaining blanks by glossing joiners "=":
    gsub(/[ ]+/, "=", lin);
    # Now replace each slash by one space.
    # NOTE(review): the default-separator split below collapses runs of
    # blanks, so empty segments ("//") yield no OP token, whereas the TR
    # line (split at "/") keeps its empty glosses - confirm this asymmetry
    # is intended.
    gsub(/[\/]/, " ", lin);
  } else {
    # Remove leading and trailing spaces, reduce multiple spaces:
    gsub(/^[ ]+/, "", lin);
    gsub(/[ ]+$/, "", lin);
    gsub(/[ ][ ]+/, " ", lin);
  }
  # Split into separate tokens at blanks:
  mop = split(lin, wop);
  # {wop[1..mop]} are the raw OP tokens.
  noptoks += mop;
  next;
}

/^[<][>]tr/ {
  # Gloss line
  ntrlines++;
  if (hadtr) { data_error("multiple TR lines"); err = 1; next; }
  hadtr = 1;
  if (! hadop) { data_error("TR line before OP line"); err = 1; next; }
  # Get whole line:
  lin = $0;
  # Remove "<>tr {" and "}":
  gsub(/^[<][>]tr *{/, "", lin);
  gsub(/} *$/, "", lin);
  # Remove leading and trailing spaces of all slash-separated segments:
  gsub(/^[ ]+/, "", lin);
  gsub(/[ ]+$/, "", lin);
  gsub(/[ ]+[\/]/, "/", lin);
  gsub(/[\/][ ]+/, "/", lin);
  # Reduce multiple internal spaces:
  gsub(/[ ][ ]+/, " ", lin);
  # Split at slashes:
  mtr = split(lin, wtr, "/");
  # {wtr[1..mtr]} are the raw glosses.
  ntrtoks += mtr;
  # Try to pair up tokens and glosses
  process_tokens();
  next;
}

/./ {
  # Other lines - just print them
  lin = $0;
  # Change the "<>" into "\\":
  gsub(/^[<][>]/, "\\", lin);
  printf "%s\n", lin > synFile;
  next;
}

function start_group() {
  # Common bookkeeping for the start of a title or text group: reports the
  # previous group if it had an OP line without a TR line (or vice versa),
  # then resets the per-group state.  The state is reset even after an
  # error, so that one incomplete group cannot leak its {hadop}/{hadtr}
  # flags (and its stale {wop} tokens) into the following group.
  # The flag {err} is set here and by the OP/TR rules; it is not read
  # anywhere in this file as shown.
  if (hadop != hadtr) { data_error("missing OP or TR line"); }
  loc = "{}";
  hadop = 0;
  hadtr = 0;
  err = 0;
}

function process_tokens(   mm,i,o,t) {
  # Assumes defined
  #   {wop[1..mop]} - Omaha-Ponca tokens,
  #   {wtr[1..mtr]} - Corresponding glosses
  # Writes the cleaned pairs (or, on a count mismatch, the unpaired tokens
  # and glosses) to {gprFile}, and the raw side-by-side listing to {synFile}.

  # Max number of tokens or glosses:
  mm = ( mop > mtr ? mop : mtr );

  if (mop == mtr) {
    # Print glossing pairs to {gprFile}:
    for (i = 1; i <= mm; i++) {
      # Get {i}th OP token {o} and matching gloss {t}, lowercased and cleaned:
      o = op_cleanup(wop[i]);
      t = tr_cleanup(wtr[i]);
      # Map blanks and glossing joiners to "_":
      gsub(/[ =]/, "_", o);
      gsub(/[ =]/, "_", t);
      printf "%s %s\n", o, t > gprFile;
      npairs++;
    }
  } else {
    pairing_error(("OP/TR token mismatch - mop = " mop " mtr = " mtr));
    # Print OP tokens and glossing tokens separately, unpaired:
    for (i = 1; i <= mop; i++) {
      # Get {i}th OP token {o}, lowercased and cleaned:
      o = op_cleanup(wop[i]);
      t = "×";
      # Map blanks and glossing joiners to "_":
      gsub(/[ =]/, "_", o);
      printf "%s %s\n", o, t > gprFile;
      noplones++;
    }
    for (i = 1; i <= mtr; i++) {
      # Get {i}th gloss {t}, lowercased and cleaned:
      o = "×";
      t = tr_cleanup(wtr[i]);
      # Map blanks and glossing joiners to "_":
      gsub(/[ =]/, "_", t);
      printf "%s %s\n", o, t > gprFile;
      ntrlones++;
    }
  }

  # Print aligned tokens:
  for (i = 1; i <= mm; i++) {
    # Get {i}th raw OP token {o} and raw gloss {t},
    # padding the shorter list with "×":
    o = (i <= mop ? wop[i] : "×");
    t = (i <= mtr ? wtr[i] : "×");
    # Replace glossing joiner "=" by spaces:
    gsub(/[=]/, " ", o);
    gsub(/[=]/, " ", t);
    # Print ostensibly matched token pair:
    printf "\\ot %30s %s\n", o, t > synFile;
  }
  printf "\n" > synFile;
}

function lower_and_squeeze(x) {
  # Shared tail of {op_cleanup} and {tr_cleanup}: maps {x} to lower case,
  # trims leading/trailing blanks, reduces internal blank runs to one blank,
  # and returns "EMPTY" if nothing is left.
  x = tolower(x);
  # Explicit mapping of accented capitals, for the case where {tolower}
  # leaves them alone.  The patterns are plain literals rather than
  # bracket expressions: a bracket expression around a multibyte character
  # matches its individual bytes when gawk runs in a byte-oriented locale,
  # which would corrupt unrelated characters.
  gsub(/Á/, "á", x); gsub(/É/, "é", x); gsub(/Í/, "í", x);
  gsub(/Ó/, "ó", x); gsub(/Ú/, "ú", x);
  gsub(/Ä/, "ä", x); gsub(/Ë/, "ë", x); gsub(/Ï/, "ï", x);
  gsub(/Ö/, "ö", x); gsub(/Ü/, "ü", x);
  gsub(/Â/, "â", x); gsub(/Ê/, "ê", x); gsub(/Î/, "î", x);
  gsub(/Ô/, "ô", x); gsub(/Û/, "û", x);
  # Remove superfluous spaces
  gsub(/^[ ]+/, "", x);
  gsub(/[ ]+$/, "", x);
  gsub(/[ ][ ]+/, " ", x);
  # Ward off against empty results:
  if (x == "") { x = "EMPTY"; }
  return x;
}

function op_cleanup(x) {
  # Remove trailing and leading punctuation from OP token {x}.
  # Keep internal punctuation, but remove any "=" that were
  # inserted between words and leading/trailing puncts.
  # Beware that "?" is a phoneme except word-finally, so it is
  # stripped only from the end of the token.
  # Returns the cleaned, lowercased token, or "EMPTY" if nothing remains.
  gsub(/[-][-]$/, "", x);
  gsub(/^[][().,:;!"=_]+/, "", x);
  gsub(/[][().,:;!?"=_]+$/, "", x);
  return lower_and_squeeze(x);
}

function tr_cleanup(x) {
  # Remove trailing and leading punctuation from gloss {x}.
  # Keep internal punctuation (including any "=").
  # Returns the cleaned, lowercased gloss, or "EMPTY" if nothing remains.
  gsub(/[-][-]$/, "", x);
  gsub(/^[][().,:;!?"_]+/, "", x);
  gsub(/[][().,:;!?"_]+$/, "", x);
  return lower_and_squeeze(x);
}

END {
  fflush(synFile);
  if (abort >= 0) { exit(abort); }
  # Check for incomplete groups:
  if (hadop != hadtr) { data_error("missing OP or TR line"); }
  # Print stats
  printf "%7d title groups read\n", nttgroups > "/dev/stderr";
  printf "%7d text groups read\n", ntxgroups > "/dev/stderr";
  printf "%7d OP lines read (%d tokens, %d unpaired)\n", \
    noplines, noptoks, noplones > "/dev/stderr";
  printf "%7d TR lines read (%d glosses,%d unpaired)\n", \
    ntrlines, ntrtoks, ntrlones > "/dev/stderr";
  printf "%7d glossing pairs written\n", npairs > "/dev/stderr";
  # Flush and release both output files.
  close(synFile);
  close(gprFile);
}

function arg_error(msg) {
  # Reports a command-line error {msg}, prints the usage text to stderr,
  # and terminates with status 1.  {abort} is set so that the END rule
  # exits immediately as well.
  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit 1;
}

function pairing_error(msg) {
  # Records a token/gloss pairing problem {msg} in {synFile} only.
  printf "\\** line %d: %s\n", FNR, msg > synFile;
}

function data_error(msg) {
  # Reports an input data problem {msg} on stderr (with file name, line
  # number and current locator {loc}) and records it in {synFile}.
  printf "%s:%d: %s ** %s\n", FILENAME, FNR, loc, msg > "/dev/stderr";
  printf "\\** line %d: %s\n", FNR, msg > synFile;
}