#! /bin/csh -f
# Last edited on 2002-01-04 10:37:03 by stolfi

set usage = "$0 < INPUT.txt"

# Analyzes a Vietnamese VIQR plaintext read from standard input and
# writes out the following files in the current directory:
#
#   .all.wds             All valid VIQR words in the text, including hyphenated ones.
#   .uppercase.wds       Subset of .all.wds that contains uppercase letters.
#   .lowercase.wds       Complement of .uppercase.wds.
#   .capitalized-lc.wds  Words from .lowercase.wds, with initial capitalized.
#   .dualcase.wds        Words that occur in both lower case and initial-cap.
#   .uc-only.wds         Words that occur only in initial-cap (or funny-caps).
#
# Requires the external commands "identify-viqr-words" and "bool"
# (neither is defined in this file), plus grep, sort, uniq and gawk.
#
# NOTE(review): the character tests below assume the VIQR text is plain
# ASCII -- confirm if the input may carry 8-bit or UTF-8 letters.

# Step 1: discard input lines that begin with "#" or "@", pass the rest
# through the word identifier, and keep one sorted copy of each word.
grep -E -v '^[#@]' \
  | identify-viqr-words \
      -v valid=1 -v punctuation=0 -v invalid=0 \
  | sort | uniq \
  > .all.wds

# Step 2: split the word list by presence of an uppercase letter.
# The class [[:upper:]] is used instead of the range [A-Z] because, in
# some non-C locales, a range may also match lowercase letters.
grep -E '[[:upper:]]' .all.wds \
  | sort | uniq \
  > .uppercase.wds

grep -E -v '[[:upper:]]' .all.wds \
  | sort | uniq \
  > .lowercase.wds

# Step 3: capitalize the initial of every lowercase word.  A leading
# "dd" is treated as one unit and becomes "DD"; any other leading
# lowercase letter, including a single "d", is uppercased by itself.
# Words that start with anything else are copied unchanged.  Blank
# lines are dropped.  The list must be re-sorted afterwards because
# capitalization changes the collating order.
gawk \
    ' /./{ \
        w = $0; \
        if (match(w, /^(dd?|[a-ce-z])/)) \
          { w = (toupper(substr(w, 1, RLENGTH)) substr(w, RLENGTH+1)); } \
        print w; \
      } ' \
    .lowercase.wds \
  | sort | uniq \
  > .capitalized-lc.wds

# Step 4: compare the two lists.  Judging from the file descriptions
# above, "1.2" presumably selects the words present in both files and
# "1-2" the words present only in the first -- verify against the
# documentation of "bool".
bool 1.2 .uppercase.wds .capitalized-lc.wds \
  > .dualcase.wds

bool 1-2 .uppercase.wds .capitalized-lc.wds \
  > .uc-only.wds