#! /usr/bin/gawk -f 
# Last edited on 2000-05-30 18:47:13 by stolfi

BEGIN {
  # Reads a file of words factored into QOKOKOKO elements, with their
  # crust/mantle/core structure bracketed with "()<>" (see the script
  # "split-words"). Writes, for each non-blank input line, the output
  # selected by the component tags given in "select" (CTAGS). See
  # pack_components for the meaning of ordinary tag strings.
  #
  # The following CTAGS values are handled specially. They output
  # pairs "X-Y" for each word, where X and Y are given by this table:
  #
  #   tag      X                    Y
  #   -------  -------------------  --------------------
  #   tc-y     type of component    coarse component.
  #   tf-z     type of component    fine component.
  #   tw-w     type of word         the word.
  #   k-w      number of "peaks"    the word.
  #

  # "abort" stays negative until error() records a failure status.
  abort = -1;

  # Command-line synopsis, assembled one line at a time.
  usage = ARGV[0];
  usage = usage " -f get-component.gawk \\\n";
  usage = usage "  -v select=CTAGS \\\n";
  usage = usage "  < INFILE.fcp \\\n";
  usage = usage "  > OUTFILE.pairs";

  # The tag selector is mandatory.
  if (select == "") { error("must specify \"select\""); }
}

# Stop with the recorded status once an error has been flagged.
abort >= 0 { exit abort; }

# Blank lines (empty or spaces only) are ignored.
/^ *$/ { next; }

# Every other line is handed to the packer chosen by "select";
# an empty result produces no output line.
/./ {
  d = ( \
    (select == "k-w")  ? pack_npeaks_word($0) : \
    (select == "tc-y") ? pack_type_and_coarse_comps($0) : \
    (select == "tf-z") ? pack_type_and_fine_comps($0) : \
    (select == "tw-w") ? pack_type_word($0) : \
                         pack_components($0, select, ".") \
  );

  if (d != "") { print d; }
}

function error(msg)
{
  # Reports "msg" on standard error, prefixed with the current record
  # number, records the failure in the global "abort", and terminates
  # the program with exit status 1.
  abort = 1;
  printf("line %d: %s\n", NR, msg) >> "/dev/stderr";
  exit abort;
}

function pack_components(w,tag,ety,     out,grp,piece)
{
  # Returns the component(s) of the simple word "w" selected by "tag".
  # Each letter of "tag" selects one component:
  #
  #   p = crust prefix
  #   m = mantle prefix
  #   c = core
  #   n = mantle suffix
  #   s = crust suffix
  #
  # The selected components are concatenated, in tag order, without
  # separators. Any other tag character is copied to the output as is.
  # If the final result is empty, "ety" is returned in its place.
  #
  # The component strings and "maxlevel" are globals expected to be
  # set by get_components(w), which is defined outside this block.
  #
  # A requested component that does not exist at the word's "maxlevel"
  # contributes nothing. When the word has no core, "m" and "n" are
  # only meaningful as a group "m", optional "c"s, "n", which selects
  # the whole mantle; when it has neither core nor mantle, "p" and "s"
  # are only meaningful as a group "p", any of "m"/"c"/"n", "s", which
  # selects the whole crust. A tag that cannot be split into such
  # unambiguous groups makes the function return the empty string.

  get_components(w);
  # debug_components(w);
  out = "";
  while (tag != "")
    { # Peel off the next unambiguous group; give up if there is none.
      grp = pick_unambiguous_tag_group(tag, maxlevel);
      if (grp == "") { return(""); }
      tag = substr(tag, length(grp)+1);

      # Drop the letters of components that are absent at this level.
      grp = reduce_tag_group(grp, maxlevel);

      # Translate the reduced group into the text it stands for.
      if      (grp == "")   { piece = ""; }
      else if (grp == "p")  { piece = precrust; }
      else if (grp == "ps") { piece = crust; }
      else if (grp == "m")  { piece = premantle; }
      else if (grp == "mn") { piece = mantle; }
      else if (grp == "c")  { piece = core; }
      else if (grp == "n")  { piece = sufmantle; }
      else if (grp == "s")  { piece = sufcrust; }
      else                  { piece = grp; }

      out = ( out piece );
    }
  if (out == "") { return(ety); }
  return(out);
}

function pick_unambiguous_tag_group(tag,maxlevel)
{ 
  # Returns the shortest non-empty prefix of "tag" that describes a
  # set of components that can be unambiguously identified in any word
  # with given "maxlevel". If there is no such prefix, returns "".
  # 
  # Thus, for example, "p" and "s" can be unambiguously identified if
  # "maxlevel != 1"; otherwise, one can only identify the
  # concatenation "p[mcn]*s". Likewise, "m" and "n" can be
  # unambiguously identified only if "maxlevel != 2", otherwise one
  # can only identify the concatenation "mc*n".
  #
  # The whole alternation is grouped under the "^" anchor, so that
  # both alternatives can only match at the start of "tag". (With the
  # anchor applied to the first alternative only, a group found later
  # in the string would yield RLENGTH for a match that does not begin
  # at position 1, and the prefix cut from position 1 would be wrong.)
  
  if (tag == "") 
    { return(""); }
  else if (maxlevel == 1) 
    { if (match(tag, /^([^ps]|[p][mcn]*[s])/))
        { return(substr(tag,1,RLENGTH)); }
      else
        { return(""); }
    }
  else if (maxlevel == 2)
    { if (match(tag, /^([^mn]|[m][c]*[n])/))
        { return(substr(tag,1,RLENGTH)); }
      else
        { return(""); }
    }
  else 
    { # Every tag letter stands on its own at the other levels.
      return(substr(tag,1,1));
    }
}
  
function reduce_tag_group(t,maxlevel,  drop)
{
  # Given a minimal non-ambiguous tag group for the specified
  # "maxlevel", returns the group with the letters of the components
  # that do not exist at that level removed: "c" below level 3,
  # "m" and "n" below level 2, "p" and "s" below level 1.

  # Collect every letter to be deleted, then delete them in one pass.
  drop = "";
  if (maxlevel < 3) { drop = drop "c"; }
  if (maxlevel < 2) { drop = drop "mn"; }
  if (maxlevel < 1) { drop = drop "ps"; }

  if (drop != "") { gsub(("[" drop "]"), "", t); }
  return(t);
}

function pack_type_and_coarse_comps(w)
{
  # Returns newline-separated "TYPE-TEXT" pairs describing the coarse
  # components of "w", where TYPE is one of "c", "pm", "ns" or "pmns".
  # The component strings and "maxlevel" are globals expected to be
  # set by get_components(w), which is defined outside this block.
  # Which pairs are produced depends on "maxlevel":
  #
  #   0:  "c", "pm" and "ns", all with empty text.
  #   1:  empty "c"; "pmns" carrying the whole crust.
  #   2:  empty "c"; "pmns" carrying crust prefix, mantle, crust suffix.
  #   3:  "c" = core; "pm" = crust + mantle prefixes;
  #       "ns" = mantle + crust suffixes.
  #
  # Any other "maxlevel" is reported through error().

  get_components(w);
  # debug_components(w);
  if (maxlevel == 0) { return("c-\npm-\nns-"); }
  if (maxlevel == 1) { return(("c-\npmns-" crust)); }
  if (maxlevel == 2) { return(("c-\npmns-" precrust mantle sufcrust)); }
  if (maxlevel == 3)
    { return(( \
        "c-" core \
        "\npm-" precrust premantle \
        "\nns-" sufmantle sufcrust \
      ));
    }
  error(("bad maxlevel = " maxlevel));
}

function pack_type_and_fine_comps(w,  y)
{
  # Returns newline-separated "TYPE-TEXT" pairs describing the fine
  # components of "w", where TYPE is one of "c", "m", "n", "p", "s",
  # "mn" or "ps". The component strings and "maxlevel" are globals
  # expected to be set by get_components(w), which is defined outside
  # this block. Which pairs are produced depends on "maxlevel":
  #
  #   0:  "c", "m", "n", "p", "s", all with empty text.
  #   1:  empty "m", "n", "c"; "ps" carrying the whole crust.
  #   2:  empty "c"; "mn" = mantle; "p" and "s" = crust prefix/suffix.
  #   3:  "c" = core; "m"/"n" = mantle prefix/suffix;
  #       "p"/"s" = crust prefix/suffix.
  #
  # Any other "maxlevel" is reported through error().
  #
  # The result never ends with a newline: the caller prints it with
  # "print", which appends the record terminator itself, so a trailing
  # "\n" here would produce an extra blank line in the output.

  get_components(w);
  # debug_components(w);
  if (maxlevel == 0) 
    { y = "c-\nm-\nn-\np-\ns-"; }
  else if (maxlevel == 1) 
    { y = sprintf("m-\nn-\nc-\nps-%s", crust); }
  else if (maxlevel == 2) 
    { y = sprintf("c-\nmn-%s\np-%s\ns-%s", mantle,precrust,sufcrust); }
  else if (maxlevel == 3) 
    { y = sprintf("c-%s\nm-%s\nn-%s\np-%s\ns-%s",
        core, premantle, sufmantle, precrust, sufcrust);
    }
  else
    { error(("bad maxlevel = " maxlevel)); }
  return(y);
}

function pack_type_word(w,  kind)
{
  # Returns "TYPE-w", where TYPE lists the tags of the components
  # present in "w": "p", "m", "n", "s" appear only when the matching
  # component is non-empty; "c" always appears at level 3, "mn" always
  # at level 2, and "ps" at level 1 when the crust is non-empty. At
  # level 0 TYPE is empty. The component strings and "maxlevel" are
  # globals expected to be set by get_components(w), which is defined
  # outside this block. Any other "maxlevel" is reported via error().

  get_components(w);
  # debug_components(w);
  kind = "";
  if (maxlevel == 0)
    { # No components at all: the type stays empty.
    }
  else if (maxlevel == 1)
    { if (crust != "") { kind = "ps"; } }
  else if (maxlevel == 2)
    { if (precrust != "") { kind = ( kind "p" ); }
      kind = ( kind "mn" );
      if (sufcrust != "") { kind = ( kind "s" ); }
    }
  else if (maxlevel == 3)
    { if (precrust != "")  { kind = ( kind "p" ); }
      if (premantle != "") { kind = ( kind "m" ); }
      kind = ( kind "c" );
      if (sufmantle != "") { kind = ( kind "n" ); }
      if (sufcrust != "")  { kind = ( kind "s" ); }
    }
  else
    { error(("bad maxlevel = " maxlevel)); }
  return ((kind "-" w));
}

function pack_npeaks_word(w,  marked, nmarks)
{
  # Returns "K-w", where K is one more than the number of "@" marks
  # left in a copy of "w" after every closing bracket (")" or ">")
  # that is followed, possibly after some non-bracket characters, by
  # an opening bracket ("<" or "(") has been collapsed into "@".
  # Each such mark is a local minimum of the component bracketing.

  marked = w;
  gsub(/[)>][^<>()]*[<(]/, "@", marked);

  # Replacing each "@" by itself leaves the text alone and yields
  # the number of marks as the return value of gsub.
  nmarks = gsub(/@/, "@", marked);

  return ((nmarks + 1) "-" w);
}

function debug_components(w,    body)
{
  # Debugging aid: prints "w = STRUCTURE" on standard output, where
  # STRUCTURE shows the current global component strings nested in
  # braces according to the global "maxlevel" (nothing at level 0).
  # Any other "maxlevel" is reported through error().

  if (maxlevel == 0)
    { body = ""; }
  else if (maxlevel == 1)
    { body = ( "{" crust "}" ); }
  else if (maxlevel == 2)
    { body = ( "{" precrust "{" mantle "}" sufcrust "}" ); }
  else if (maxlevel == 3)
    { body = ( \
        "{" precrust "{" premantle "{" core "}" sufmantle "}" sufcrust "}" \
      );
    }
  else
    { error(("bad maxlevel = " maxlevel));
      return;
    }
  printf "%s = %s\n", w, body;
}