#! /usr/bin/gawk -f
# Last edited on 2004-10-14 02:21:07 by stolfi

BEGIN {
  # Converts the Omaha-Ponca corpus from J. Stolfi's (JS) current format,
  # read from standard input, into a best approximation of
  # J. E. Koontz's (JEK) original format, written to standard output.
  #
  # Apart from comment lines, a JS file is made of two kinds of groups:
  #
  #   Title groups, opened by "@section N {tt}", holding the fields
  #   "<>tt", "<>st", "<>dt", "<>au".
  #
  #   Text groups, opened by "@section 4 {v??}", holding in this order
  #   the lines "<>rf", "<>pr", "<>op", "<>tr", "<>nt", "<>nk",
  #   "<>ns", "<>xr".
  #
  # JEK format places the reference number at the end of the block.

  usage = ( ARGV[0] " [-v keepComments=BOOL] < main.src > main.jek" );
  abort = -1;      # Requested exit status; negative means "keep going"

  # Comment and blank lines are dropped unless the user says otherwise:
  if (keepComments == "") { keepComments = 0; }

  # Parsing state:
  grouptype = "";  # Type of the group being read ("tx", "tt", or "" if none)
  loc = "{}";      # Most recent jod-locator seen ("{}" if none)

  # Statistics:
  ntxgroups = 0;   # Count of text groups seen so far
  nttgroups = 0;   # Count of title groups seen so far
}

# Once an exit status has been requested, stop reading input.
abort >= 0 { exit abort; }

/^ *([\#]|$)/ {
  # Comment or blank line: discarded, unless the user asked to keep
  # such lines, in which case it is re-encoded and echoed.
  if (! keepComments) { next; }
  print convert_en_line($0);
  next;
}

# Charset declaration lines carry nothing for the output; skip them.
/^[@]chars/ { next; }

/^[@]section *[0-9] *{tt[0-9]*}/ {
  # A title group opens here.  Close the previous group (if any)
  # before resetting the state for the new one.
  if (grouptype != "") { finish_group(); }
  nttgroups++;
  grouptype = "tt"; loc = "{}";
  next;
}

/^[@]section *[0-9] *{v[0-9?]*}/ {
  # A text group opens here.  Close the previous group (if any)
  # before resetting the state for the new one.
  if (grouptype != "") { finish_group(); }
  ntxgroups++;
  grouptype = "tx"; loc = "{}";
  next;
}

/^[<][>]rf/ {
  # Locator line.  The locator is only remembered here; it is printed
  # by finish_group when the enclosing group is closed.
  loc = $0;
  # Strip the leading "<>rf" tag and any blanks around the locator:
  sub(/^[<][>]rf */, "", loc);
  sub(/ +$/, "", loc);
  # A locator only makes sense inside a text group:
  if (grouptype != "tx")  
    { data_error(("locator outside of a text group \"" loc "\"")); }
  # Accept only "{jod:1890:N.N}", "{jod:1891:N.N}" or "{sent:N}":
  if (! ((loc ~ /^{jod[:]189[01]:[0-9]+[.][0-9]+}$/) || (loc ~ /^{sent[:][0-9]+}$/)))
    { data_error(("malformed locator \"" loc "\"")); }
  next;
}

/^[<][>](tt|st|dt|au|pr|op|tr|nt|nk|ns|xr)/ {
  # Data line: re-encodes its text and prints it as a JEK line of the
  # form "\TAG TEXT".  Tags "ns" and "nk" are both written as "nt"
  # (with " [JS]" or " [JEK]" appended to the text), "tt" is written
  # as "ti", and every other tag is kept as is.
  if (grouptype == "")  
    { data_error(("data line not in any group")); }
  # Get input line tag.  The opening brace may follow the tag without
  # an intervening blank (the stripping pattern below allows for that),
  # in which case the first field also contains the brace and part of
  # the text; so the tag is cut at the first brace.
  tag = $1;
  sub(/^[<][>] */, "", tag);
  sub(/[{].*$/, "", tag);
  # Get input line, remove "<>", tag, braces, and adjacent spaces:
  lin = $0;
  gsub(/^[<][>][a-z][a-z] *{ */, "", lin);
  gsub(/ *} *$/, "", lin);
  # Process line and output it
  if (tag == "op")
    { # Omaha-Ponca line, perhaps with embedded non-OP text marked &...:
      lin = convert_op_line(lin);
      xtag = "op";
    }
  else 
    { # English/Latin line, perhaps with embedded OP text marked @{...}:
      if (tag == "tr")
        { # Glossing line
          # Remove "%" from empty glosses
          gsub(/[%]/, "", lin);
        }
      lin = convert_en_line(lin);
      if (tag == "ns")
        { lin = ( lin " [JS]"); xtag = "nt"; }
      else if (tag == "nk")
        { lin = ( lin " [JEK]"); xtag = "nt"; }
      else if (tag == "tt")
        { xtag = "ti"; }
      else
        { xtag = tag; }
    }
  printf "\\%s %s\n", xtag, lin;
  next;
}

/./ {
  # Any non-empty line not claimed by an earlier rule has an unknown
  # tag: report it and move on.
  data_error(("unrecognized line"));
  next;
}

function finish_group(   xloc)
{
  # Closes the current group.  For a text group ("tx"), prints the
  # pending locator {loc} as a JEK "\rf" line; a title group ("tt")
  # needs no closing line.  In all cases a blank line is then printed
  # to separate this group from the next one.
  if (grouptype == "tx")
    { if (loc == "{}") { data_error(("missing locator")); }
      # Convert the locator to JEK format: drop the braces and turn
      # the colon after "jod" or "sent" into a blank.
      xloc = loc;
      gsub(/[{}]/, "", xloc);
      gsub(/jod:/, "jod ", xloc);
      gsub(/sent:/, "sent ", xloc);
      printf "\\rf %s\n", xloc;
    }
  printf "\n";
}

function convert_en_line(lin,   en,op,res)
{
  # Converts a line of non-OP (English/Latin) text that may contain
  # embedded OP text marked as "@{...}".  The non-OP text is copied
  # unchanged; each embedded OP chunk is re-encoded with
  # convert_op_encoding and wrapped again in "@{" and "}".
  # Returns the converted line.
  res = "";
  while (lin != "")
    { if (match(lin, /[@][{][^{}]*[}]/))
        { # Split off the leading non-OP chunk {en} and the OP text
          # {op} found between "@{" and "}":
          en = substr(lin, 1, RSTART-1);
          op = substr(lin, RSTART+2, RLENGTH-3);
          lin = substr(lin, RSTART+RLENGTH);
          # Restore the markup whenever it was present, even around
          # an empty OP text, so that "@{}" is not silently deleted:
          res = ( res en "@{" convert_op_encoding(op) "}" );
        }
      else
        { # No more embedded OP text: the rest is trailing non-OP text.
          res = ( res lin );
          lin = "";
        }
    }
  return res;
}

function convert_op_line(lin,   en,op,res)
{
  # Converts a line of OP text that may contain embedded non-OP words
  # marked as "&" followed by letters.  The OP text is re-encoded with
  # convert_op_encoding; each "&..." chunk is copied unchanged.
  # Returns the converted line.
  res = "";
  while (lin != "")
    { if (match(lin, /[&][A-ZÈa-zè]*/))
        { # Split off the leading OP chunk {op} and the embedded non-OP
          # chunk {en}.  The "&" marker is kept as part of {en}, so a
          # "&" that is not followed by any letter is preserved too
          # instead of being silently deleted:
          op = substr(lin, 1, RSTART-1);
          en = substr(lin, RSTART, RLENGTH);
          lin = substr(lin, RSTART+RLENGTH);
        }
      else
        { # No more embedded non-OP text: the rest is trailing OP text.
          op = lin; en = ""; lin = "";
        }
      # Only the OP text has its accent encoding converted:
      res = ( res convert_op_encoding(op) en );
    }
  return res;
}

function convert_op_encoding(x,   spec,tbl,n,k)
{
  # Converts a pure OP text from the JS encoding to the JEK encoding.
  #
  # The bulk of the work is a list of single-character substitutions,
  # given below as blank-separated (character, replacement) pairs and
  # applied strictly in the order listed:
  #
  #   1. [äëïöü] (either case): plain vowel plus nasalization "~";
  #   2. [âêîôû] (either case): accented vowel [áéíóú] plus "~";
  #   3. [áéíóú] (either case): plain vowel plus accent marker "ý".
  #
  # Stage 3 must come after stage 2, since it also rewrites the
  # accented vowels that stage 2 produces.
  spec =        "ä a~ ë e~ ï i~ ö o~ ü u~ Ä A~ Ë E~ Ï I~ Ö O~ Ü U~";
  spec = ( spec " â á~ ê é~ î í~ ô ó~ û ú~ Â Á~ Ê É~ Î Í~ Ô Ó~ Û Ú~" );
  spec = ( spec " á aý é eý í iý ó oý ú uý Á Aý É Eý Í Iý Ó Oý Ú Uý" );
  n = split(spec, tbl, " ");
  for (k = 1; k < n; k += 2)
    { gsub(("[" tbl[k] "]"), tbl[k+1], x); }
  #
  # Both inverted punctuation marks become a plain "?":
  gsub(/[¿¡]/, "?", x);
  #
  # The nasalization marker is "ü" rather than "~" in JEK's encoding:
  gsub(/[~]/, "ü", x);
  #
  # JEK writes the nasalization marker ahead of the accent marker:
  gsub(/[ý][ü]/, "üý", x);
  return x;
}

END {
  # After an error exit was requested, just propagate the status:
  if (abort >= 0) { exit abort; }
  # The last group of the input is still open; close it now:
  if (grouptype != "") { finish_group(); }
  # Report the group counts on the error output:
  printf("%7d title groups read\n", nttgroups) > "/dev/stderr";
  printf("%7d text groups read\n", ntxgroups) > "/dev/stderr";
}

function arg_error(msg)
{
  # Reports a command-line error: prints {msg} and the usage summary
  # on the error output, records exit status 1 in {abort} (so that the
  # other rules stop too), and exits.
  printf("%s\n", msg) > "/dev/stderr";
  printf("usage: %s\n", usage) > "/dev/stderr";
  abort = 1;
  exit abort;
}

function data_error(msg)
{
  # Reports a problem in the input data on the error output, as
  # "FILE:LINE: LOCATOR ** MESSAGE", where LOCATOR is the current
  # value of {loc}.  Processing continues afterwards.
  printf("%s:%d: %s ** %s\n", FILENAME, FNR, loc, msg) > "/dev/stderr";
}