# Last edited on 2012-05-05 19:46:18 by stolfilocal
# To be included in factor-field-general etc.
# Factors a Chinese pinyin text with disambiguating suffixes
# by placing "{}" around each letter, plus a single "{}"
# around the tone and disambiguation suffix.
# If the tone is 4 it is omitted. If the tone and suffix are
# omitted then the corresponding element is omitted too.
# Also parses the digraphs "yi" and "wu" as single letters.
function factor_text(x,    y,ts,t,s)
{
  # Factors the pinyin syllable {x} into "{}"-delimited elements.
  #
  # Input:   {x} is a run of letters, optionally followed by a tone
  #          digit (1-5) and/or a "." plus a disambiguating suffix,
  #          e.g. "ma3.2".
  # Returns: one "{}" element per letter (the digraphs "yi", "wu",
  #          "u:" and "e^" count as one letter each), followed by one
  #          element holding the tone and suffix, e.g. "{m}{a}{3.2}".
  #          Tone 4 is dropped from that element, and the element is
  #          left out entirely when both tone and suffix are empty.
  # Errors:  calls {data_error}, which must be supplied by the
  #          including program, on a malformed tone code or syllable.
  # The names after {x} in the parameter list are local variables.
  # Requires gawk, because of {gensub}.

  # Split off the trailing run of digits and dots.  The match begins
  # at the last character that is neither a digit nor a dot, that is,
  # at the last letter of the syllable:
  if (match(x, /[^0-9.][0-9.]*$/))
    { ts = substr(x, RSTART+1); x = substr(x, 1, RSTART);
      # The tone precedes the first "."; the suffix follows it:
      if (match(ts, /[.]/))
        { t = substr(ts, 1, RSTART-1); s = substr(ts, RSTART+1); }
      else
        { t = ts; s = ""; }
    }
  else
    { ts = ""; t = ""; s = ""; }

  # Format checks:
  if (t !~ /^[1-5]?$/)
    { data_error(("bad tone code \"" x ts "\"")); }
  if (x !~ /^([eE][\^]?|[uU][:]?|[a-df-tv-zA-DF-TV-Z]|ü|Ü|ê|Ê)+$/)
    { data_error(("bad pinyin \"" x ts "\"")); }

  # Omit tone 4 (the most common one).  The digit must be removed from
  # {ts} as well, since {ts} is what gets emitted below; otherwise a
  # tone 4 followed by a suffix would still appear in the output:
  if (t == "4") { t = ""; ts = substr(ts, 2); }

  # Make each letter into one element, keeping a following ":" or "^"
  # attached to its letter, and keeping "yi" and "wu" together in
  # either letter case:
  y = gensub(/([yY][iI]|[wW][uU]|([a-zA-Z]|ü|Ü|ê|Ê)[:\^]?)/, "{\\1}", "g", x);

  # Append the tone/suffix element unless both parts are empty:
  if ((t != "") || (s != "")) { y = ( y "{" ts "}" ); }
  return y;
}