#! /bin/csh -f
# Last edited on 2002-01-04 10:37:03 by stolfi
# Invocation synopsis: the input text is read from standard input.
# NOTE(review): usage is not referenced anywhere else in the visible script.
set usage = "$0 < INPUT.txt"
# Analyzes a Vietnamese VIQR plaintext, writes out the following files:
#
# .all.wds All valid VIQR words in the text, including hyphenated ones.
# .uppercase.wds Subset of .all.wds that contains uppercase letters.
# .lowercase.wds Complement of .uppercase.wds.
# .capitalized-lc.wds Words from .lowercase.wds, with initial capitalized.
# .dualcase.wds Words that occur in both lower case and initial-cap.
# .uc-only.wds Words that occur only in initial-cap (or funny-caps).
# Build .all.wds from the text on standard input:
#   1. drop every line that starts with # or @;
#   2. pass the rest through identify-viqr-words (external tool; the -v
#      settings presumably keep valid words and suppress punctuation and
#      invalid tokens -- confirm against that tool);
#   3. sort and remove duplicate lines.
# grep reads the inherited standard input directly, so no leading cat is
# needed; grep -E is used because the egrep alias is obsolescent.
grep -E -v '^[#@]' \
  | identify-viqr-words \
      -v valid=1 -v punctuation=0 -v invalid=0 \
  | sort | uniq \
  > .all.wds
# Split .all.wds into two complementary lists:
#   .uppercase.wds  words containing at least one uppercase letter;
#   .lowercase.wds  all the other words.
# The class [[:upper:]] is used instead of the range A-Z because ranges
# depend on the collating order of the current locale and may also match
# lowercase letters there.  Both filters use the same class so that the
# two outputs stay exact complements of each other.
# Each list is re-sorted so that it is sorted and duplicate-free
# regardless of how .all.wds was produced.
grep -E '[[:upper:]]' .all.wds \
  | sort | uniq \
  > .uppercase.wds
grep -E -v '[[:upper:]]' .all.wds \
  | sort | uniq \
  > .lowercase.wds
# Build .capitalized-lc.wds: every non-empty line of .lowercase.wds with
# its initial letter converted to uppercase, then sorted and de-duplicated.
# The initial letter is the two-character VIQR digraph dd when the word
# starts with it (so dd becomes DD), otherwise the first lowercase letter.
# The pattern lists dd first and also accepts a single d, so that words
# starting with a plain d (not followed by another d) are capitalized too;
# previously such words were passed through unchanged.
# Words that do not start with a lowercase letter are printed as they are.
gawk \
  ' /./ { \
      w = $0; \
      if (match(w, /^(dd|[a-z])/)) \
        { w = (toupper(substr(w, 1, RLENGTH)) substr(w, RLENGTH+1)); } \
      print w; \
    } ' \
  .lowercase.wds \
  | sort | uniq \
  > .capitalized-lc.wds
# Compare the words that contain uppercase letters with the capitalized
# forms of the all-lowercase words.  bool is an external tool not shown
# here; judging from the output descriptions in the file header, 1.2
# presumably selects lines present in both files and 1-2 lines present
# only in the first file -- confirm against the bool documentation.
bool 1.2 .uppercase.wds .capitalized-lc.wds > .dualcase.wds
bool 1-2 .uppercase.wds .capitalized-lc.wds > .uc-only.wds