#! /usr/bin/gawk -f
# Last edited on 2004-10-13 16:51:27 by stolfi

# Reads the Omaha-Ponca corpus in J. E. Koontz's original format.
# Applies some basic cleanup that preserves the format and
# meaningful contents, but makes it easier to compare against
# J. Stolfi's version:
#   * Delete final "/"s.
#   * Join continuation lines.
#   * Make sure that the "ü" and "ý" modifiers are always in that order.
#
# Input is read from standard input (or the files named on the command
# line); the cleaned-up text is written to standard output.

BEGIN {
  # Pending exit status; a negative value means "no error so far".
  abort = -1;
  usage = ( ARGV[0] " < jd-1890-1981.txt > orig.jek" );

  # Logical line being assembled from a tagged line plus its
  # continuation lines; "" means there is nothing pending.
  lin = "";
}

# Stop reading input as soon as an error has been recorded.
(abort >= 0) { exit(abort); }

# All lines: normalize in place before the classifying rules below see them.
{
  # Normalize order of accents.  The pattern is the literal two-character
  # sequence, so it matches the same text whether the regex engine works
  # on characters or on bytes:
  gsub(/ýü/, "üý", $0);
  # Remove spurious "/"s and blanks at end of line:
  sub(/[\/ ]+$/, "", $0);
}

# Blank lines (including lines that held only "/"s and blanks).
/^ *$/ {
  # Flush current line:
  flush_line();
  # Leave the blank line in the output:
  print "";
  next;
}

# Tagged lines (starting with a backslash).
/^[\\]/ {
  # Flush current line:
  flush_line();
  # Start a new one:
  lin = $0;
  next;
}

# Non-tagged line - assume it is a continuation of the current line.
{
  # Remove leading blanks:
  sub(/^[ ]+/, "", $0);
  # Join with current line:
  lin = join_continuation(lin, $0);
  next;
}

END {
  if (abort >= 0) { exit abort; }
  # Flush the last pending line:
  flush_line();
}

# Returns {cur} extended with the continuation text {text}:
#   * if {cur} is empty, the result is just {text} (no leading blank is
#     introduced when there is no line to continue);
#   * if {cur} ends with a single hyphen (not a double one), {text} is
#     appended directly, with no separating blank;
#   * otherwise {cur} and {text} are joined with one blank.
function join_continuation(cur, text)
{
  if (cur == "") { return text; }
  if (cur ~ /(^|[^-])[-]$/) { return ( cur text ); }
  return ( cur " " text );
}

# Prints the pending line {lin}, if any, and resets it to empty.
function flush_line()
{
  if (lin != "") { print lin; }
  lin = "";
}

# Reports a command-line error {msg} plus the usage string on stderr,
# records exit status 1 in {abort}, and terminates the script.
function arg_error(msg)
{
  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit 1;
}

# Reports a problem {msg} with the current input line on stderr,
# prefixed by the input file name and line number; does not stop the script.
function data_error(msg)
{
  printf "%s:%d: ** %s\n", FILENAME, FNR, msg > "/dev/stderr";
}