#! /bin/sed -f
# Last edited on 1998-07-16 02:46:20 by stolfi
#
# Factors pinyin words with numeric tones into
# their constituents C{V:T}F where
#
#   C is a consonant (single or digraph) or '@' for none
#   V is a vowel group 
#   T is a numeric tone or '0' for neutral
#   F is a final 'r' 'n' 'ng' or '@' for none.
# 
# Pass comment lines (those starting with "#") through untouched:
# "b" without a label jumps to the end of the script, skipping every
# rewrite below.
/^[#]/b
#
# Provide a default "e" vowel for isolated "r".  An "r" is isolated
# when it has no letter ("a"-"z" or "ü") on either side; line start
# and line end count as "no letter".
s/^r$/er/
s/^r\([^a-zü]\)/er\1/
s/\([^a-zü]\)r$/\1er/g
# The general (mid-line) rule is applied twice on purpose: each match
# consumes the delimiter that follows the "r", so when two isolated
# "r"s share a single separator (as in "x r r y") one pass with the
# "g" flag skips the second one.  The second pass picks it up; it is
# a no-op on text that was already expanded to "er".
s/\([^a-zü]\)r\([^a-zü]\)/\1er\2/g
s/\([^a-zü]\)r\([^a-zü]\)/\1er\2/g
#
# Mark off the vowel group: a vowel, "y" or "w", followed by any
# number of vowels, is wrapped in braces (e.g. "zhong1" -> "zh{o}ng1").
s/\([aeiouüyw][aeiouü]*\)/{\1}/g
#
# Provide '@' for empty consonant: a "{" at line start, or one that is
# preceded by something other than a lowercase letter or '@', has no
# initial consonant in front of it.
s/^{/@{/
s/\([^@a-z]\){/\1@{/g
#
# Provide '@' for empty final: a "}" at line end, or one that is
# followed by something other than a lowercase letter or '@' (for
# instance the tone digit), has no final after it.
s/}$/}@/
s/}\([^@a-z]\)/}@\1/g
#
# Unravel cryptic "ü"s: after "j", "q", "x" a written "u" stands for
# "ü", and so does a syllable-initial "yu".
s/\([jqx]\){u/\1{ü/g
s/@{yu/@{ü/g
#
# Normalize "y" "w": with an empty consonant, "yi" -> "i" and any other
# leading "y" -> "i"; likewise "wu" -> "u" and any other leading
# "w" -> "u".  The two-letter forms must be handled first.
s/@{yi/@{i/g
s/@{y/@{i/g
s/@{wu/@{u/g
s/@{w/@{u/g
#
# Unravel contracted "iu" "ui" "un" after a real consonant (not '@'):
# "iu" -> "iou", "ui" -> "uei", and "u" before a final "n" -> "ue".
s/\([^@]\){iu}/\1{iou}/g
s/\([^@]\){ui}/\1{uei}/g
s/\([^@]\){u}n/\1{ue}n/g
#
# Move the tone closer to the vowel group: a tone digit that follows
# the group and its final (any run of "n", "r", "g", '@') is moved
# inside the braces, after a ":" (e.g. "m{a}@3" -> "m{a:3}@").
# Only the digits 0-4 are recognized as tones here.
s/}\([nrg@]*\)\([0-4]\)/:\2}\1/g
#
# Provide a default zero tone: any group whose closing brace is not
# already preceded by a tone digit gets ":0".
s/\([^0-4]\)}/\1:0}/g
#