#! /usr/bin/gawk -f
# Last edited on 2000-05-23 16:06:41 by stolfi

BEGIN {
  # Purpose
  # -------
  # Reads a stream of element-factored words (elements are written
  # inside braces, "{...}") and writes each word back with its core
  # and mantle parts marked by nested delimiters.
  #
  # Classification of elements
  # --------------------------
  #   * Core element: a single element containing a gallows letter
  #     ("k", "t", "p" or "f").  Every core element is wrapped in
  #     angle brackets "<>".
  #   * Mantle element: an element without a gallows whose main letter
  #     is "ch", "sh" or "ee".
  #   * Crust element: anything else.  Crust elements are copied to
  #     the output without any marking.
  #
  # Each maximal run of consecutive core and/or mantle elements is
  # enclosed in parentheses "()"; the "<>" around core elements sit
  # inside those parentheses.
  #
  # Pre-modifiers ("i"/"c" before the main letter) and post-modifiers
  # after it do not change how an element is classified.
  # NOTE(review): the original note named the post-modifiers as [ch],
  # but the patterns in this file accept [eh]* -- confirm which one is
  # the intended description.
  #
  # Option "omods" (set with -v omods=BOOL)
  # ---------------------------------------
  # When omods is true, an "o" element (optionally followed by "e"s)
  # that sits directly between two parenthesized chunks is absorbed,
  # fusing the two chunks into one.  If the element just before it is
  # a core element, the "o" element is moved inside that core's angle
  # brackets; otherwise it simply becomes part of the surrounding
  # mantle.

  # Command-line synopsis.
  usage = sprintf("%s  [ -v omods=BOOL ] < INFILE.elt > OUTFILE.wsp", ARGV[0]);

  # Exit status requested by error(); a negative value means "no error".
  abort = -1;
}

# Once a failure status has been recorded in "abort" (it starts at -1
# and error() sets it to 1), stop and exit with that status.
abort >= 0 { exit abort }

# Records that are empty or made up only of spaces hold no word: skip them.
$0 ~ /^ *$/ { next }
# Main rule: mark the core/mantle structure of one element-factored word
# and print the result.  Note that $0 itself is rewritten along the way,
# so the "bad word" diagnostic shows the simplified form of the record.
$0 ~ /./ {

  # Drop dummy factors: first whole elements holding only "_" (or
  # nothing), then any underscores left inside other elements.
  gsub(/{[_]*}/, "");
  gsub(/[_][_]*/, "");

  # Temporarily encode the two-letter mantle letters as single capitals
  # ("ch" -> "C", "sh" -> "S", and an "ee" that follows the opening brace
  # and optional [ic] pre-modifiers -> "E") so that one-character
  # bracket expressions can recognize them.
  gsub(/ch/, "C");
  gsub(/sh/, "S");
  $0 = gensub(/([{][ic]*)ee/, "\\1E", "g");

  # Consume the word from left to right: "rest" is the part still to be
  # processed and "marked" is the output built so far.
  marked = "";
  rest = $0;
  while (rest != "") {
    progressed = 0;

    # A leading stretch without any gallows/mantle letter is crust and
    # is copied through unmarked.
    if (match(rest, /^[{][^CSEktpf]*[}]/)) {
      marked = marked substr(rest, 1, RLENGTH);
      rest = substr(rest, RLENGTH + 1);
      progressed = 1;
    }

    # A run of consecutive core/mantle elements becomes one "(...)" chunk.
    # On entry to the loop RLENGTH still describes the match made by the
    # test just above; each later pass uses the match from the loop test.
    if (match_core_mantle_element(rest)) {
      marked = marked "(";
      do {
        marked = marked substr(rest, 1, RLENGTH);
        rest = substr(rest, RLENGTH + 1);
      } while (match_core_mantle_element(rest));
      marked = marked ")";
      progressed = 1;
    }

    # Neither pattern applied: the input cannot be parsed any further.
    if (! progressed) {
      error(("bad word \"" $0 "\", got stuck with \"" rest "\""));
    }
  }

  # Wrap every core (gallows) element in angle brackets.
  marked = gensub(/([{][ic]*[ktpf][he]*[}])/, "<\\1>", "g", marked);

  if (omods) {
    # An "o" element lying directly between two chunks joins them into
    # a single chunk ...
    marked = gensub(/[)]([{][o][e]*[}])[(]/, "\\1", "g", marked);
    # ... and one that now directly follows a core element is pulled
    # inside that core's angle brackets.
    marked = gensub(/[>]([{][o][e]*[}])/, "\\1>", "g", marked);
  }

  # Undo the temporary single-capital encoding of the mantle letters.
  gsub(/C/, "ch", marked);
  gsub(/S/, "sh", marked);
  gsub(/E/, "ee", marked);

  print marked;
}

# Tests whether "x" begins with a core or mantle element: an opening
# brace, optional [ic] pre-modifiers, one of the letters C, S, E, k, t,
# p, f, optional [eh] post-modifiers, and a closing brace.
#
# Returns 1 (the match position) when it does and 0 otherwise.  As a side
# effect of match(), RSTART and RLENGTH are set; callers use RLENGTH to
# cut the matched element off the front of the string.
function match_core_mantle_element(x)
{
  return match(x, /^[{][ic]*[CSEktpf][eh]*[}]/);
}

# Reports a fatal problem and terminates the program.
#
#   msg  text of the diagnostic; it is written to standard error,
#        prefixed with the current input record number (NR).
#
# The failure status is also stored in the global "abort", which the
# (abort >= 0) guard rule checks, and the program exits with status 1.
function error(msg)
{
  abort = 1;
  printf("line %d: %s\n", NR, msg) >> "/dev/stderr";
  exit abort;
}