#! /bin/csh -f
# Last edited on 2002-01-04 10:37:03 by stolfi

# Usage text: according to this string the input text arrives on standard input.
# NOTE(review): the variable is not referenced anywhere else in the visible
# script, and no argument checking is performed.
set usage = "$0 < INPUT.txt"

# Analyzes a Vietnamese VIQR plaintext, writes out the following files:
# 
#   .all.wds             All valid VIQR words in the text, including hyphenated ones.
#   .uppercase.wds       Subset of .all.wds that contains uppercase letters.
#   .lowercase.wds       Complement of .uppercase.wds.
#   .capitalized-lc.wds  Words from .lowercase.wds, with initial capitalized.
#   .dualcase.wds        Words that occur in both lower case and initial-cap.
#   .uc-only.wds         Words that occur only in initial-cap (or funny-caps).

# Build .all.wds: the sorted list of distinct words found in the text that
# arrives on standard input.  Lines beginning with a hash or an at-sign are
# discarded before the words are identified.
# identify-viqr-words is an external helper; the -v settings presumably make
# it emit valid words only (no punctuation, no invalid tokens) -- TODO confirm
# against that tool.
# grep -E is used because egrep is obsolescent and warns on recent GNU grep;
# grep reads standard input itself, so no leading cat is needed; sort -u does
# the job of sort piped into uniq in a single process.
grep -E -v '^[#@]' \
  | identify-viqr-words \
      -v valid=1 -v punctuation=0 -v invalid=0 \
  | sort -u \
  > .all.wds
  
# Build .uppercase.wds: the words of .all.wds that contain at least one
# letter in the range A-Z.
# grep -E replaces the obsolescent egrep and reads the file directly instead
# of through cat; sort -u replaces sort piped into uniq.
grep -E '[A-Z]' .all.wds \
  | sort -u \
  > .uppercase.wds
  
# Build .lowercase.wds: the words of .all.wds that contain no letter in the
# range A-Z, i.e. the complement of the uppercase subset.
# grep -E replaces the obsolescent egrep and reads the file directly instead
# of through cat; sort -u replaces sort piped into uniq.
grep -E -v '[A-Z]' .all.wds \
  | sort -u \
  > .lowercase.wds
  
# Build .capitalized-lc.wds: every non-empty word of .lowercase.wds with its
# initial letter converted to upper case, sorted and without duplicates.
# A leading dd (the VIQR spelling of d-with-stroke) counts as one letter and
# becomes DD; any other leading lowercase letter, including a single d, is
# capitalized on its own.  The previous pattern excluded d from the
# single-letter class, so words starting with a lone d were left in lower
# case and could never match their capitalized form.
# Words that do not start with a lowercase letter are printed unchanged.
# NOTE(review): an initial-cap word spelled Dd... in the text will not equal
# the DD... form produced here -- confirm which spelling the input uses.
# The backslash at the end of each program line is required by csh inside a
# quoted string that spans several lines.
gawk \
    ' /./{ \
        w = $0; \
        if (match(w, /^(dd|[a-z])/)) \
          { w = (toupper(substr(w, 1, RLENGTH)) substr(w, RLENGTH+1)); } \
        print w; \
      } ' \
    .lowercase.wds \
  | sort -u \
  > .capitalized-lc.wds
  
# Build .dualcase.wds, meant to hold the words that occur both in lower case
# and with an initial capital.  bool is an external helper; the 1.2 operator
# presumably selects lines common to both sorted lists -- TODO confirm
# against that tool.
bool 1.2 .uppercase.wds .capitalized-lc.wds > .dualcase.wds
 
# Build .uc-only.wds, meant to hold the words that occur only capitalized
# (or in other mixed-case forms).  bool is an external helper; the 1-2
# operator presumably selects lines of the first list that are absent from
# the second -- TODO confirm against that tool.
bool 1-2 .uppercase.wds .capitalized-lc.wds > .uc-only.wds