#! /usr/bin/gawk -f
# Last edited on 2001-01-02 04:10:49 by stolfi
BEGIN {
  # Error sentinel: stays negative until an error handler stores an
  # exit status in it.
  abort = -1;

  # Command-line synopsis.
  usage = "factor-word-oko \\\n" \
    " [ -v inField=NUM ] \\\n" \
    " [ -v erase=BOOL ] \\\n" \
    " [ -v outField=NUM ] \\\n" \
    " < INFILE > OUTFILE";

  # Reads INFILE and splits field number "inField" of every record
  # into its OKOKOKO elements.  The field is expected to be in EVA;
  # ligature capitalization is stripped before factoring and
  # re-applied to the factored result.
  #
  # The factored word is inserted into the record as field number
  # "outField".  When "erase" is true the original input field is
  # removed from the record; otherwise it is kept.
  #
  # In the output, every QOKOKOKO element (together with its "I" and
  # "E" complements) is enclosed in "{}".

  # Defaults for options not given with "-v":
  #   inField  = 1, erase = 0 (keep the input field),
  #   outField = inField.
  if (inField == "")
    { inField = 1; }
  if (erase == "")
    { erase = 0; }
  if (outField == "")
    { outField = inField; }
}
# Stop with status "abort" once an error handler has set it.
(abort >= 0) { exit abort; }

# Per-record processing.
{
  # Lines starting with "#" are comments: copy them through verbatim.
  if ($0 ~ /^#/) { print; next; }

  # Records containing no characters at all produce no output.
  if ($0 !~ /./) { next; }

  if (NF < inField) { data_error("not enough input fields"); }

  # Strip ligature capitals, factor the word, then restore the capitals
  # on the factored result.
  word = uncapitalize_ligatures($(inField));
  factored = capitalize_ligatures(factor_text_oko(word));

  printout(factored, outField, inField, erase);
  next;
}
function factor_text_oko(x,   y,e)
{
  # Decomposes the EVA text "x" into its OKOKOKO elements and returns
  # the factored text, with "{}" around every element.
  #
  # "x" must be uncapitalized EVA.  Inline "{...}" comments and "!"
  # fillers are removed first; if any character remains that is not a
  # lowercase letter or one of "-=/,. *?%", the problem is reported
  # through data_error.  Runs of the separator characters "-=/,." and
  # blank are copied to the output unchanged, without braces.
  # Output is uncapitalized.
  #
  # "y" (the output built so far) and "e" (the element just split off)
  # are local variables, not arguments.

  gsub(/[{][^{}]*[}]/, "", x);
  gsub(/[!]/, "", x);
  if (match(x, /[^-=\/,. *?%a-z]/)) { data_error(("invalid char in word \"" x "\"")); }

  # Map "sh", "ch", and "ee" to single letters to simplify the parsing.
  # Note that "eee" groups are paired off from the left end.
  gsub(/ch/, "C", x);
  gsub(/sh/, "S", x);
  gsub(/ee/, "E", x);

  # Map platformed and half-platformed letters to capitals to simplify
  # the parsing.  The three-letter forms must be folded before the
  # two-letter "c?" prefixes, which are substrings of them.
  gsub(/ckh/, "K", x);
  gsub(/cth/, "T", x);
  gsub(/cfh/, "F", x);
  gsub(/cph/, "P", x);
  #
  gsub(/ikh/, "G", x);
  gsub(/ith/, "H", x);
  gsub(/ifh/, "M", x);
  gsub(/iph/, "N", x);
  #
  gsub(/ck/, "U", x);
  gsub(/ct/, "V", x);
  gsub(/cf/, "X", x);
  gsub(/cp/, "Y", x);

  y = "";
  while (x != "")
    { if (match(x, /^[-=\/,. ]+/))
        { # Separator run: copy through without braces.
          e = substr(x, 1, RLENGTH); x = substr(x, RLENGTH+1);
          y = ( y e );
        }
      else
        { # Start of a word: split off initial <q> if any.
          if (match(x, /^[q]/))
            { e = substr(x, 1, RLENGTH); x = substr(x, RLENGTH+1);
              y = ( y "{" e "}");
            }
          # Consume elements until a separator (or the end) is reached.
          # Every separator character, including the blank, is excluded
          # from the element patterns below, so a separator always ends
          # the word and is handled by the branch above; otherwise a
          # blank inside the text would be emitted as a "{ }" element.
          while (1)
            { if (match(x, /^[aoy]/))
                { # split off "[aoy]" group
                  e = substr(x, 1, RLENGTH); x = substr(x, RLENGTH+1);
                }
              else if (match(x, /^([i]+[dlrsxmgn]?|[^-,.\/= aoyehi][eh]?|[^-,.\/= aoy])/))
                { # copy next main letter with "i" and "e" complements
                  e = substr(x, 1, RLENGTH); x = substr(x, RLENGTH+1);
                }
              else
                { break; }
              y = ( y "{" e "}");
            }
        }
    }

  # Unfold letter folding:
  gsub(/U/, "ck", y);
  gsub(/V/, "ct", y);
  gsub(/X/, "cf", y);
  gsub(/Y/, "cp", y);
  #
  gsub(/G/, "ikh", y);
  gsub(/H/, "ith", y);
  gsub(/M/, "ifh", y);
  gsub(/N/, "iph", y);
  #
  gsub(/K/, "ckh", y);
  gsub(/T/, "cth", y);
  gsub(/P/, "cph", y);
  gsub(/F/, "cfh", y);
  #
  gsub(/C/, "ch", y);
  gsub(/S/, "sh", y);
  gsub(/E/, "ee", y);
  return y;
}
function uncapitalize_ligatures(w,   caps,n,k,c)
{
  # Returns "w" with ligature capitalization removed: each of the
  # capital letters listed in "caps" is replaced by its lowercase
  # form.  All other characters are left untouched.
  # "caps", "n", "k" and "c" are local variables.
  caps = "CSIHKTPFYOA";
  n = length(caps);
  for (k = 1; k <= n; k++)
    { c = substr(caps, k, 1);
      gsub(c, tolower(c), w);
    }
  return w;
}
function capitalize_ligatures(w,   mids,n,k,lo,up)
{
  # Returns "w" with ligature capitalization applied: in every
  # recognized ligature all letters except the final one are turned
  # into capitals ("ch" -> "Ch", "ckh" -> "CKh", "ikh" -> "IKh", ...).
  # A "?" is accepted in place of a ligature letter.
  # "mids", "n", "k", "lo" and "up" are local variables.
  gsub(/ch/, "Ch", w);
  gsub(/sh/, "Sh", w);

  # Three-letter ligatures whose middle letter is k, t, p or f,
  # preceded by "c", "i" or "?" and followed by "h".
  mids = "ktpf";
  n = length(mids);
  for (k = 1; k <= n; k++)
    { lo = substr(mids, k, 1);
      up = toupper(lo);
      gsub(("c" lo "h"), ("C" up "h"), w);
      gsub(("i" lo "h"), ("I" up "h"), w);
      gsub(("[?]" lo "h"), ("?" up "h"), w);
    }

  # Ligatures whose middle letter is "?".
  gsub(/c[?]h/, "C?h", w);
  gsub(/i[?]h/, "I?h", w);
  gsub(/c[?]/, "C?", w);

  # Doubled "h".
  gsub(/hh/, "Hh", w);
  return w;
}
function printout(mw, ofn, ifn, del,   i)
{
  # Prints the current record ($0) with "mw" inserted as field
  # number "ofn".
  #
  # If "del" is true, field "ifn" is first deleted from the record
  # (this modifies $0 and NF), and "ofn" then refers to a position in
  # the shortened record.  Problems are reported through data_error
  # when the record has fewer than "ifn" fields (deletion requested)
  # or fewer than "ofn-1" fields (nowhere to insert).
  #
  # "i" is a local loop index.
  if (del)
    { if (NF < ifn)
        { data_error("not enough input fields\n"); }
      else
        { # Shift fields ifn+1..NF one slot to the left, then drop the
          # now-duplicated last slot.  The loop must run up to i = NF-1
          # so that the last field is moved too; stopping earlier would
          # discard the real last field instead of field "ifn".
          for (i = ifn; i < NF; i++) { $(i) = $(i+1); }
          NF--;
        }
    }
  if (NF < ofn-1) { data_error("not enough output fields\n"); }
  if (ofn == 1)
    { # Insert in front of the whole record.
      print mw, $0;
    }
  else if (ofn == NF+1)
    { # Append after the whole record.
      print $0, mw;
    }
  else
    { # Insert between fields ofn-1 and ofn.
      for (i = 1; i < ofn; i++) { printf "%s%s", $(i), OFS; }
      printf "%s", mw;
      for (i = ofn; i <= NF; i++) { printf "%s%s", OFS, $(i); }
      printf "\n";
    }
}
function data_error(msg)
{
  # Reports a problem with the current input record: writes "msg",
  # prefixed with the record number NR, to standard error, records
  # the failure in "abort", and terminates with exit status 1.
  printf("line %d: %s\n", NR, msg) >> "/dev/stderr";
  abort = 1;
  exit abort;
}
function arg_error(msg)
{
  # Reports a problem that is not tied to an input record: writes
  # "msg" to standard error, records the failure in "abort", and
  # terminates with exit status 1.
  printf("%s\n", msg) >> "/dev/stderr";
  abort = 1;
  exit abort;
}