# Last edited on 2002-01-16 18:10:54 by stolfi

# Factors a text by placing "{}" around each OKOKOKO element,
# together with its "I" and "E" complements.
# Assumes that the field is in EVA. Ligature-capitalization is 
# ignored on input, and added to the output. 
# To be included in factor-field-general etc.

function factor_text(x,   y,e)
{
  # Decomposes the EVA text "x" into its OKOKOKO elements and returns
  # the same text with "{}" around each element.
  #
  # Ligature capitalization is ignored on input and re-applied to the
  # output. Any "{...}" groups and "!" characters still present in "x"
  # are discarded. Runs of the separators "-", "=", "/", ",", "." and
  # space are copied to the output verbatim, without braces.
  #
  # Calls "data_error" if, after uncapitalization, "x" contains any
  # character other than the separators, "*", "?", "%" and "a"-"z".
  #
  # "y" (the output being built) and "e" (the element just split off)
  # are local variables, not arguments.

  # Remove ligature capitalization, to simplify patterns:
  x = uncapitalize_ligatures(x);
  # Discard "{...}" groups and "!" characters:
  gsub(/{[^{}]*}/, "", x);
  gsub(/[!]/, "", x);
  if (match(x, /[^-=\/,. *?%a-z]/)) { data_error(("invalid char in word \"" x "\"")); }
  
  # From here on valid input has no capital letters, so capitals can be
  # used as single-letter codes for multi-letter groups.

  # Map "sh", "ch", and "ee" to single letters to simplify the parsing.
  # Note that "eee" groups are paired off from the left end. 
  gsub(/ch/, "C", x);
  gsub(/sh/, "S", x);
  gsub(/ee/, "E", x);

  # Map platformed and half-platformed letters to capitals to simplify the parsing:
  gsub(/ckh/, "K", x);
  gsub(/cth/, "T", x);
  gsub(/cfh/, "F", x);
  gsub(/cph/, "P", x);
  #             
  gsub(/ikh/, "G", x);
  gsub(/ith/, "H", x);
  gsub(/ifh/, "M", x);
  gsub(/iph/, "N", x);
  #
  gsub(/ck/, "U", x);
  gsub(/ct/, "V", x);
  gsub(/cf/, "X", x);
  gsub(/cp/, "Y", x);
  
  y = ""; 
  
  while (x != "")
    { # printf "x = [%s]\n", x > "/dev/stderr";
      if (match(x, /^[-=\/,. ]+/))
        { # Copy a run of separators to the output, unbraced:
          e = substr(x,1,RLENGTH); x = substr(x, RLENGTH+1);
          y = ( y e );
        }
      else
        { # split off initial <q> if any:
          if (match(x, /^[q]/)) 
            { e = substr(x,1,RLENGTH); x = substr(x, RLENGTH+1);
              y = ( y "{" e "}");
            }

          # Split off elements until the next separator or end of text.
          # The negated classes below must exclude exactly the separator
          # set used above (including the space); otherwise a separator
          # would be braced as if it were a letter, possibly together
          # with a following "e" or "h".
          while (1)
            { if (match(x, /^[aoy]/))
                { # split off "[aoy]" group
                  e = substr(x,1,RLENGTH); x = substr(x, RLENGTH+1);
                }
              else if (match(x, /^([i]+[dlrsxmgn]?|[^-=\/,. aoyehi][eh]?|[^-=\/,. aoy])/))
                { # copy next main letter with "i" and "e" complements
                  e = substr(x,1,RLENGTH); x = substr(x, RLENGTH+1); }
              else
                { break; }
              y = ( y "{" e "}");
            }
        }
    }
  # Unfold letter folding:
  gsub(/U/, "ck", y);
  gsub(/V/, "ct", y);
  gsub(/X/, "cf", y);
  gsub(/Y/, "cp", y);
  #
  gsub(/G/, "ikh", y);
  gsub(/H/, "ith", y);
  gsub(/M/, "ifh", y);
  gsub(/N/, "iph", y);
  #
  gsub(/K/, "ckh", y);
  gsub(/T/, "cth", y);
  gsub(/P/, "cph", y);
  gsub(/F/, "cfh", y);
  #
  gsub(/C/, "ch", y);
  gsub(/S/, "sh", y);
  gsub(/E/, "ee", y);

  # Now add ligatures:
  y = capitalize_ligatures(y);
  return y;
}

function uncapitalize_ligatures(w,   caps,ncaps,k)
{
  # Returns "w" with ligature-capitalization removed: each of the
  # capitals listed below is replaced by its lowercase form. Any other
  # character, including other capitals, is left untouched.
  # "caps", "ncaps" and "k" are local variables, not arguments.
  ncaps = split("C S I H K T P F Y O A", caps, " ");
  for (k = 1; k <= ncaps; k++)
    { gsub(caps[k], tolower(caps[k]), w); }
  return w;
}

function capitalize_ligatures(w,   gal,ngal,k,g,G)
{
  # Returns "w" with ligature capitalization applied.
  # "gal", "ngal", "k", "g" and "G" are local variables, not arguments.

  # Plain two-letter groups "ch" and "sh":
  gsub(/ch/, "Ch", w);
  gsub(/sh/, "Sh", w);

  # Groups "<p><g>h" where <g> is one of k, t, p, f and <p> is "c",
  # "i" or "?": capitalize <p> and <g>, leave the final "h" alone.
  ngal = split("k t p f", gal, " ");
  for (k = 1; k <= ngal; k++)
    { g = gal[k]; G = toupper(g);
      gsub(("c" g "h"),   ("C" G "h"), w);
      gsub(("i" g "h"),   ("I" G "h"), w);
      gsub(("[?]" g "h"), ("?" G "h"), w);
    }

  # Groups whose middle letter is "?". The three-letter forms must be
  # handled before the bare "c?" so that they are matched whole.
  gsub(/c[?]h/, "C?h", w); 
  gsub(/i[?]h/, "I?h", w);
  gsub(/c[?]/, "C?", w);

  # Finally, an "h" followed by another "h":
  gsub(/hh/, "Hh", w);
  return w;
}