#! /usr/bin/gawk -f
# Last edited on 2004-10-13 16:51:27 by stolfi

BEGIN {
  # Purpose: read the Omaha-Ponca corpus in J. E. Koontz's original
  # format and apply a light cleanup that keeps the format and the
  # meaningful contents intact, so that the result is easier to
  # compare against J. Stolfi's version.  The cleanup steps are:
  #
  #   1. Strip the trailing "/"s from each line.
  #   2. Merge continuation lines into the line they continue.
  #   3. Put the "ü" and "ı" modifiers in that fixed order.

  # Logical line currently being assembled; empty when nothing is pending.
  lin = "";

  # Command-line synopsis, printed by arg_error().
  usage = ARGV[0] " < jd-1890-1981.txt > orig.jek";

  # Exit status requested by an error handler; negative means "no error".
  abort = -1;
}

# An error status has been recorded: stop reading input and exit with it.
abort >= 0 {
  exit abort;
}

{
  # Applied to every input record, before the rules below classify it.
  # Put the accent modifiers in canonical order ("ü" before "ı"):
  gsub(/[ı][ü]/, "üı");
  # Drop the trailing run of "/"s and blanks, if any.  The pattern is
  # anchored at the end of the record, so one substitution is enough:
  sub(/[\/ ]*$/, "");
}

$0 ~ /^ *$/ {
  # A record that is empty or contains only blanks terminates the
  # pending logical line and is itself reproduced as an empty line.
  flush_line();
  print "";
  next;
}

$0 ~ /^[\\]/ {
  # A record that begins with a backslash is a tagged line: write out
  # whatever was pending, then make this record the new pending line.
  flush_line();
  lin = $0;
  next;
}

# Returns the pending line `prev` extended with the continuation `text`.
#   * If `prev` is empty there is nothing to continue, so `text` is
#     returned as is (no separator, hence no leading blank).
#   * If `prev` ends with a single hyphen (not a double one), the two
#     pieces are glued together without a space.
#   * Otherwise they are joined with a single space.
function join_continuation(prev, text)
{
  if (prev == "") { return text; }
  if (prev ~ /(^|[^-])[-]$/) { return ( prev text ); }
  return ( prev " " text );
}

// {
  # Non-tagged, non-blank line - assume it is a continuation.
  # Remove leading blanks:
  gsub(/^[ ]+/, "", $0);
  # Join with the current line.  A continuation that shows up while no
  # line is pending (start of input, or right after a blank line)
  # simply becomes the pending line instead of gaining a leading space.
  lin = join_continuation(lin, $0);
  next;
}

END {
  # On an aborted run, only propagate the recorded exit status.
  if (abort >= 0)
    exit abort;
  # Normal end of input: write out the last pending logical line, if any.
  flush_line();
}

function flush_line()
{
  # Writes the pending logical line (global `lin`) to standard output,
  # unless it is empty, and then resets `lin` to the empty string.
  if (length(lin) > 0)
    print lin;
  lin = "";
}

function arg_error(msg)
{
  # Reports a command-line error: prints `msg` and the usage synopsis
  # (global `usage`) to standard error, then terminates with status 1.
  # `abort` is set first so that the END rule, which still runs after
  # `exit`, sees the error status and skips its normal work.
  abort = 1;
  printf("%s\n", msg) > "/dev/stderr";
  printf("usage: %s\n", usage) > "/dev/stderr";
  exit abort;
}

function data_error(msg)
{
  # Prints `msg` to standard error, prefixed with the current input
  # file name and the record number within that file.  It neither
  # sets `abort` nor exits, so processing continues afterwards.
  printf("%s:%d: ** %s\n", FILENAME, FNR, msg) > "/dev/stderr";
}