#! /usr/bin/gawk -f
# Last edited on 2004-10-22 02:11:10 by stolfi

BEGIN {
  abort = -1;
  usage = ( ARGV[0] " \\\n" \
    "  [ -v synFile=NAME.syn ] \\\n" \
    "  [ -v gprFile=NAME.gpr ] \\\n" \
    "  < main.src" \
  );
  
  # Reads the Omaha-Ponca corpus in temporary format. Writes to "{synFile}" a 
  # version where the tokens in each OP line are printed side-by-side
  # with their TR glosses. Also writes to the file "{gprFile}" the list
  # of all the OmahaPonca-English pairs from properly paired OP and TR lines.
  
  # Define a "text group" as a set of three or more matched lines from the
  # JOD corpus (in order, "<>rf", "<>pr", "<>op", "<>tr", possibly followed by
  # "<>nt", "<>nk", "<>ns", "<>xr"). The input file should have a
  # header "@section 4 {v??}" before each group.
  
  nttgroups = 0;  # Number of title groups seen.
  ntxgroups = 0;  # Number of text groups seen.
  noplines = 0;   # Number of Omaha-Ponca lines.
  ntrlines = 0;   # Number of gloss lines.
  noptoks = 0;    # Total number of Omaha-Ponca tokens read.
  ntrtoks = 0;    # Total number of glosses read.
  npairs = 0;     # Total glossing pairs written to {gprFile}.
  noplones = 0;   # Total unmatched OP tokens written to {gprFile}.
  ntrlones = 0;   # Total unmatched glosses written to {gprFile}.
  hadop = 0;      # True iff current group had an op line.
  hadtr = 0;      # True iff current group had a tr line.
  loc = "{}";     # Last jod-locator seen.

  # Both output files are optional (see {usage}). An empty file name in
  # a "print > name" redirection is a fatal error in gawk, so an output
  # that was not requested is sent to "/dev/null" instead.
  if (synFile != "") 
    { printf "writing synchronized glosses to %s\n", synFile > "/dev/stderr"; }
  else
    { synFile = "/dev/null"; }
  if (gprFile != "") 
    { printf "writing Op-En glossary to %s\n", gprFile > "/dev/stderr"; }
  else
    { gprFile = "/dev/null"; }
}

# Once {abort} holds an exit status (it starts as -1), stop reading
# input and leave with that status.
abort >= 0 { exit abort; }

# Skip blank lines and (possibly indented) "#" comment lines.
/^ *([\#]|$)/ { next; }

# Skip "@chars" charset declarations.
/^[@]chars/ { next; }

/^[@]section *[0-9] *{tt}/ {
  # Start of new title group.
  nttgroups++;
  # Report the previous group if it had only one of the OP/TR lines.
  # The state is reset even in that case: otherwise the stale
  # {hadop}/{hadtr} flags would leak into this group, causing spurious
  # errors and pairing the old group's OP tokens with the new glosses.
  if (hadop != hadtr) { data_error("missing OP or TR line"); }
  # Reset state:
  loc = "{}";
  hadop = 0; hadtr = 0;
  err = 0;
  next;
}

/^[@]section *[0-9] *{v[0-9?]*}/ {
  # Start of new text group.
  ntxgroups++;
  # Report the previous group if it had only one of the OP/TR lines.
  # The state is reset even in that case: otherwise the stale
  # {hadop}/{hadtr} flags would leak into this group, causing spurious
  # errors and pairing the old group's OP tokens with the new glosses.
  if (hadop != hadtr) { data_error("missing OP or TR line"); }
  # Reset state:
  loc = "{}";
  hadop = 0; hadtr = 0;
  err = 0;
  next;
}

/^[<][>]rf/ {
  # Locator line: keep what follows the "<>rf" tag, minus the blanks
  # after the tag and at the end of the line.
  loc = $0;
  sub(/^[<][>]rf */, "", loc);
  sub(/ +$/, "", loc);
  # A locator must look like "{jod:1890:N.N}", "{jod:1891:N.N}" or "{sent:N}";
  # anything else is reported, but the line is still echoed below.
  if (! ((loc ~ /^{jod[:]189[01]:[0-9]+[.][0-9]+}$/) || (loc ~ /^{sent[:][0-9]+}$/)))
    { data_error(("malformed locator \"" loc "\"")); }
  # Record the locator and the input line number in both output files:
  printf("\\rf %s = line %d\n", loc, FNR) > synFile;
  printf("# %s %d\n", loc, FNR) > gprFile;
  next;
}

/^[<][>]op/ { 
  # Omaha-Ponca line. Leaves the raw tokens in {wop[1..mop]} for the
  # "<>tr" rule that follows in the same group.
  noplines++;
  if (hadop) { data_error("multiple OP lines"); err = 1; next; }
  hadop = 1;
  if (hadtr) { data_error("TR line before OP line"); err = 1; next; }
  # Get whole line:
  lin = $0;
  # Remove "<>op {" and "}":
  gsub(/^[<][>]op *{/, "", lin);
  gsub(/} *$/, "", lin);
  # Make sure that "--"s have space after but not before:
  gsub(/[ ]*[-][-][ ]*/, "-- ", lin);
  # If the line contains any slashes, use them as token separators:
  if (lin ~ /[\/]/)
    { # Remove leading and trailing spaces of all slash-separated segments:
      gsub(/^[ ]+/, "", lin);
      gsub(/[ ]+$/, "", lin);
      gsub(/[ ]+[\/]/, "/", lin);
      gsub(/[\/][ ]+/, "/", lin);
      # Replace remaining blanks by glossing joiners "=":
      gsub(/[ ]+/, "=", lin);
      # Now replace slashes by spaces (preserving multiplicity).
      gsub(/[\/]/, " ", lin);
      # Split at every single blank, so that empty segments ("a//b",
      # leading or trailing "/") yield empty tokens, exactly as the
      # "/"-split of the TR line does. The default whitespace split
      # would collapse them and cause spurious OP/TR count mismatches.
      mop = split(lin, wop, "[ ]"); # {wop[1..mop]} are the raw OP tokens.
    }
  else
    { # Remove leading and trailing spaces, reduce multiple spaces:
      gsub(/^[ ]+/, "", lin);
      gsub(/[ ]+$/, "", lin);
      gsub(/[ ][ ]+/, " ", lin);
      # Split into separate tokens at blanks:
      mop = split(lin, wop); # {wop[1..mop]} are the raw OP tokens.
    }
  noptoks += mop;
  next;
}

/^[<][>]tr/ {
  # Gloss line. Must come after the OP line of the same group, and
  # at most once per group.
  ntrlines++;
  if (hadtr) { data_error("multiple TR lines"); err = 1; next; }
  hadtr = 1;
  if (! hadop) { data_error("TR line before OP line"); err = 1; next; }

  # Strip the "<>tr {" prefix and the closing "}":
  lin = $0;
  sub(/^[<][>]tr *{/, "", lin);
  sub(/} *$/, "", lin);

  # Trim blanks at both ends of the line and around every slash:
  sub(/^[ ]+/, "", lin);
  sub(/[ ]+$/, "", lin);
  gsub(/[ ]+[\/]/, "/", lin);
  gsub(/[\/][ ]+/, "/", lin);
  # Collapse runs of blanks inside a gloss:
  gsub(/[ ][ ]+/, " ", lin);

  # Glosses are delimited by slashes; {wtr[1..mtr]} are the raw glosses.
  mtr = split(lin, wtr, "/");
  ntrtoks += mtr;

  # Pair the glosses with the OP tokens and write both outputs:
  process_tokens();
  next;
}

/./ { 
  # Any other non-empty line is copied to {synFile}, with a leading
  # "<>" tag marker (if present) turned into a backslash.
  lin = $0;
  sub(/^[<][>]/, "\\", lin);
  printf("%s\n", lin) > synFile;
  next;
}

function process_tokens(    nmax, k, optok, gloss)
{
  # Works on the globals left by the "<>op" and "<>tr" rules:
  #   {wop[1..mop]} - raw Omaha-Ponca tokens,
  #   {wtr[1..mtr]} - raw glosses.
  # Writes glossary entries to {gprFile} and the side-by-side token
  # listing to {synFile}; updates {npairs}, {noplones}, {ntrlones}.

  if (mop != mtr)
    { # Counts disagree: flag it in {synFile}, then list each OP token
      # and each gloss on its own in {gprFile}, with "×" as the partner.
      pairing_error(("OP/TR token mismatch - mop = " mop "  mtr = " mtr));
      for (k = 1; k <= mop; k++)
        { optok = op_cleanup(wop[k]);
          # Map blanks and glossing joiners to "_":
          gsub(/[ =]/, "_", optok);
          printf "%s %s\n", optok, "×" > gprFile;
          noplones++;
        }
      for (k = 1; k <= mtr; k++)
        { gloss = tr_cleanup(wtr[k]);
          # Map blanks and glossing joiners to "_":
          gsub(/[ =]/, "_", gloss);
          printf "%s %s\n", "×", gloss > gprFile;
          ntrlones++;
        }
    }
  else
    { # Counts agree: the {k}th token goes with the {k}th gloss.
      for (k = 1; k <= mop; k++)
        { optok = op_cleanup(wop[k]);
          gloss = tr_cleanup(wtr[k]);
          # Map blanks and glossing joiners to "_":
          gsub(/[ =]/, "_", optok);
          gsub(/[ =]/, "_", gloss);
          printf "%s %s\n", optok, gloss > gprFile;
          npairs++;
        }
    }

  # Side-by-side listing of the raw tokens in {synFile}; the shorter
  # list is padded with "×".
  nmax = ( mop > mtr ? mop : mtr );
  for (k = 1; k <= nmax; k++)
    { optok = (k <= mop ? wop[k] : "×");
      gloss = (k <= mtr ? wtr[k] : "×");
      # Show glossing joiners "=" as blanks again:
      gsub(/[=]/, " ", optok);
      gsub(/[=]/, " ", gloss);
      printf "\\ot %30s %s\n", optok, gloss > synFile;
    }
  printf "\n" > synFile;
}

function op_cleanup(x,    n, k, up, lo)
{
  # Returns OP token {x} normalized for the glossary: a trailing "--"
  # and leading/trailing punctuation removed, mapped to lower case,
  # blanks trimmed and squeezed. Returns "EMPTY" if nothing is left.
  # ({n}, {k}, {up}, {lo} are local work variables, not parameters.)
  #
  # Keep internal punctuation, but remove any "=" that were
  # inserted between words and leading/trailing puncts.
  # Beware that "?" is a phoneme except word-finally.
  gsub(/[-][-]$/, "", x); 
  gsub(/^[][().,:;!"=_]+/, "", x);
  gsub(/[][().,:;!?"=_]+$/, "", x);
  # Map to lower case:
  x = tolower(x);
  # {tolower} may leave accented capitals alone, depending on the locale,
  # so map them explicitly. Each letter is matched as a literal sequence,
  # not as a bracket expression: in a byte-oriented locale a bracket
  # around a multibyte letter is a set of single bytes, and would
  # corrupt every character that shares one of those bytes.
  n = split("Á É Í Ó Ú Ä Ë Ï Ö Ü Â Ê Î Ô Û", up, " ");
  split("á é í ó ú ä ë ï ö ü â ê î ô û", lo, " ");
  for (k = 1; k <= n; k++) { gsub(up[k], lo[k], x); }

  # Remove superfluous spaces
  gsub(/^[ ]+/, "", x);
  gsub(/[ ]+$/, "", x);
  gsub(/[ ][ ]+/, " ", x);
  # Ward off against empty words:
  if (x == "") { x = "EMPTY"; }
  return x;
}

function tr_cleanup(x,    n, k, up, lo)
{
  # Returns gloss {x} normalized for the glossary: a trailing "--"
  # and leading/trailing punctuation removed, mapped to lower case,
  # blanks trimmed and squeezed. Returns "EMPTY" if nothing is left.
  # ({n}, {k}, {up}, {lo} are local work variables, not parameters.)
  #
  # Keep internal punctuation.
  gsub(/[-][-]$/, "", x); 
  gsub(/^[][().,:;!?"_]+/, "", x);
  gsub(/[][().,:;!?"_]+$/, "", x);
  # Map to lower case:
  x = tolower(x);
  # {tolower} may leave accented capitals alone, depending on the locale,
  # so map them explicitly. Each letter is matched as a literal sequence,
  # not as a bracket expression: in a byte-oriented locale a bracket
  # around a multibyte letter is a set of single bytes, and would
  # corrupt every character that shares one of those bytes.
  n = split("Á É Í Ó Ú Ä Ë Ï Ö Ü Â Ê Î Ô Û", up, " ");
  split("á é í ó ú ä ë ï ö ü â ê î ô û", lo, " ");
  for (k = 1; k <= n; k++) { gsub(up[k], lo[k], x); }

  # Remove superfluous spaces
  gsub(/^[ ]+/, "", x);
  gsub(/[ ]+$/, "", x);
  gsub(/[ ][ ]+/, " ", x);
  # Ward off against empty glosses:
  if (x == "") { x = "EMPTY"; }
  return x;
}

END {
  # Push out whatever is still buffered for {synFile}.
  fflush(synFile);
  # If {abort} holds an exit status, leave with it and skip the summary.
  if (abort >= 0) { exit(abort); }
  # The last group gets the same completeness check as the others:
  if (hadop != hadtr) { data_error("missing OP or TR line"); }
  # Summary of what was read and written, on the error stream:
  printf("%7d title groups read\n", nttgroups) > "/dev/stderr";
  printf("%7d text groups read\n", ntxgroups) > "/dev/stderr";
  printf("%7d OP lines read (%d tokens, %d unpaired)\n", noplines, noptoks, noplones) > "/dev/stderr";
  printf("%7d TR lines read (%d glosses,%d unpaired)\n", ntrlines, ntrtoks, ntrlones) > "/dev/stderr";
  printf("%7d glossing pairs written\n", npairs) > "/dev/stderr";
}

function arg_error(msg)
{
  # Prints {msg} and the usage text to the error stream, sets {abort}
  # to 1 and exits with status 1.
  printf("%s\n", msg) > "/dev/stderr";
  printf("usage: %s\n", usage) > "/dev/stderr";
  abort = 1;
  exit 1;
}

function pairing_error(msg)
{
  # Notes {msg} in {synFile} only, tagged with the current input line
  # number; nothing is written to the error stream.
  printf("\\** line %d: %s\n", FNR, msg) > synFile;
}

function data_error(msg)
{
  # Reports {msg} twice: on the error stream, prefixed by the input file
  # name, line number and the current locator {loc}; and in {synFile},
  # tagged with the line number. Processing continues afterwards.
  printf("%s:%d: %s ** %s\n", FILENAME, FNR, loc, msg) > "/dev/stderr";
  printf("\\** line %d: %s\n", FNR, msg) > synFile;
}