# Last edited on 2000-05-31 01:19:07 by stolfi

function factor_text(x,eelump,chshsmash,ktsmash,hicsmash,hicsplit,esplit,   y,e)
{ 
  # 
  # Decomposes the EVA text "x" into its QOKOKOKOK elements, which are
  # delimited by "{}". Any empty elements are represented by "{_}".
  # Assumes "x" is uncapitalized EVA. EVA spaces and "/" are allowed.
  # Returns the factored string; "y" and "e" are local work variables
  # and should not be supplied by the caller.
  # 
  # Each word is emitted as one Q slot (a leading "q" or "{_}"),
  # followed by alternating O and K slots, always ending with an O slot.
  # 
  # If "eelump" is set, the group "ee" is treated as single letter
  # similar to "ch" and "sh", independently of "esplit". (Any "eee"
  # groups are parsed as "ee" followed by "e".)
  # 
  # If "chshsmash" is set, the combination "sh" is mapped to "ch".
  # (If "eelump" and "chshsmash" are both set, "ee" too is mapped to "ch").
  # 
  # If "ktsmash" is set, the letter "t" is mapped to "k",
  # and "f" to "p".
  #
  # If "hicsmash" is set, the letters [hc] except in "ch" and "sh",
  # and the letter "i" before a gallows, are converted to "e".
  #
  # If "hicsplit" is set, the letters [hic] are treated as independent
  # K elements (except for "i" before [dlrsxvmngj], and the digraphs
  # "ch" and "sh"). In particular, platform gallows, like "cth", are
  # split into three elements, like "{c}{_}{t}{_}{h}". Otherwise the
  # letters [ic] are parsed as element pre-modifiers and "h" as a
  # post-modifier, whenever possible.
  # 
  # If "esplit" is set, the letter "e" is treated as a separate K
  # element, otherwise it is treated as an element post-modifier
  # whenever possible.  (However, lumped "ee"s remain lumped.)
  #
  # Uses "gensub", so it requires GNU awk. Calls "error", which is
  # not defined in this part of the file (presumably it aborts).

  # Delete non-significant comments:
  gsub(/{[^{}]*}/, "", x);
  gsub(/[! ]/, "", x);
  
  # Check for invalid letters:
  if (match(x, /[^-=\/,. *?%a-z]/)) { error(("invalid char in word \"" x "\"")); }
  
  # Map "sh" "ch" to single letters to simplify the parsing.
  gsub(/ch/, "C", x);
  gsub(/sh/, "S", x);
  if (eelump) { gsub(/ee/, "E", x); }

  # NOTE: all the optional mappings below must act on the argument "x".
  # (They used to act on "$0", which left the result unaffected and
  # clobbered the caller's current input record.)

  # Map [hic] to "e" if the user asked for it.  The digraphs "ch" and "sh"
  # are already folded to "C" and "S", so only isolated [hc] are affected:
  if (hicsmash)
    { 
      x = gensub(/i([ktpf])/, "e\\1", "g", x);
      gsub(/c/, "e", x);
      gsub(/h/, "e", x);
    }
  
  # Map [tf] to [kp] if the user asked for it:
  if (ktsmash)
    { 
      gsub(/[t]/, "k", x);
      gsub(/[f]/, "p", x);
    }

  # Map "sh" (and lumped "ee") to "ch" if the user asked for it:
  if (chshsmash)
    { 
      gsub(/[SE]/, "C", x);
    }

  # Main loop - consumes words from "x" and appends results to "y".
  y = ""; 

  while (x != "")
    { # printf "x = [%s]\n", x > "/dev/stderr";
      # Copy punctuation if any.  The filler "%" is copied here too: it
      # passes the validity check but is rejected by both match_O and
      # match_K, so otherwise it would never be consumed and this loop
      # would not terminate.
      if (match(x, /^[-=\/,. %]+/))
        { 
          e = substr(x,1,RLENGTH); x = substr(x, RLENGTH+1);
          y = ( y e );
        }
      else
        { # split off initial <q> if any:
          if (match(x, /^[q]/)) 
            { e = substr(x,1,RLENGTH); x = substr(x, RLENGTH+1); }
          else
            { e = "_"; }
          y = ( y "{" e "}");

          # Secondary loop - splits elements from "x", appends them to "y"
          while (1)
            { # split off "[aoy]" group with eventual [ci] prefix and [he] suffix
              if (match_O(x,hicsplit,esplit))
                { e = substr(x,1,RLENGTH); x = substr(x, RLENGTH+1); }
              else
                { e = "_"; }
              y = ( y "{" e "}" );

              # copy next main letter with [ci] prefix and [he] suffix;
              # the word ends when no K element can be split off.
              if (match_K(x,hicsplit,esplit))
                { e = substr(x,1,RLENGTH); x = substr(x, RLENGTH+1); }
              else
                { break; }
              y = ( y "{" e "}" );
            }
        }
    }

  # Unfold letter folding:

  gsub(/C/, "ch", y);
  gsub(/S/, "sh", y);
  gsub(/E/, "ee", y);

  return y;
}

function match_O(x,hicsplit,esplit)
{
  # Tries to match an O element (one of the letters [aoy], with its
  # modifiers) at the beginning of "x".
  #
  # Returns RSTART (1 on success, 0 on failure); on success RLENGTH
  # holds the length of the matched element, as set by "match".
  #
  # If "hicsplit" is set, no [ci] prefix or "h" suffix is absorbed.
  # If "esplit" is set, no "e" suffix is absorbed.  Otherwise the
  # element is either "i" + letter + one or more "h", or zero or more
  # "c" + letter + zero or more "h", in both cases followed by any "e"s.

  if (esplit && hicsplit)
    { match(x, /^[aoy]/); }
  else if (hicsplit)
    { match(x, /^[aoy][e]*/); }
  else if (esplit)
    { if (! match(x, /^[i][aoy][h][h]*/))
        { match(x, /^[c]*[aoy][h]*/); }
    }
  else
    { # The "i...h" form absorbs trailing "e"s too, like the same
      # branch of match_K does (the "[e]*" used to be missing here).
      if (! match(x, /^[i][aoy][h][h]*[e]*/))
        { match(x, /^[c]*[aoy][h]*[e]*/); }
    }
  return(RSTART);
}

function match_K(x,hicsplit,esplit)
{
  # Tries to match a K element (a main letter other than [aoy], with
  # its modifiers) at the beginning of "x".
  #
  # Returns RSTART (1 on success, 0 on failure); on success RLENGTH
  # holds the length of the matched element, as set by "match".

  # A letter of [dlrsxvnmgj] with any preceding "i"s is always one element:
  if (match(x, /^[i]*[dlrsxvnmgj]/)) { return(RSTART); }

  if (hicsplit)
    { # No [ci] prefix or "h" suffix; "e" suffix only if not "esplit":
      if (esplit)
        { match(x, /^[^-=\/,. %!aoy]/); }
      else
        { match(x, /^[^-=\/,. %!aoy][e]*/); }
    }
  else
    { # Try "i" + letter + one or more "h" first; failing that,
      # zero or more "c" + letter + zero or more "h".  Trailing "e"s
      # are absorbed only if not "esplit".
      if (esplit)
        { if (! match(x, /^[i][^-=\/,. %!aoy][h][h]*/))
            { match(x, /^[c]*[^-=\/,. %!aoy][h]*/); }
        }
      else
        { if (! match(x, /^[i][^-=\/,. %!aoy][h][h]*[e]*/))
            { match(x, /^[c]*[^-=\/,. %!aoy][h]*[e]*/); }
        }
    }
  return(RSTART);
}