#! /bin/gawk -f
# Last edited on 2002-01-04 10:46:29 by stolfi

# Splits the input file into a sequence of Vietnamese (VIQR) words,
# punctuation signs, and other intervening strings, and writes them
# to standard output, one per line.  Blanks, tabs and newlines are
# treated as word separators.  Lines that are blank, or whose first
# non-blank character is "#", are ignored.
#
# Command-line variables (each set with "-v NAME=BOOL", default 0):
#
#   valid        if true, outputs the valid VIQR words.
#   punctuation  if true, outputs the punctuation signs.
#   invalid      if true, outputs the invalid strings.
#   tagged       if true, prepends "+" to the valid words, "." to
#                punctuation, and "-" to the invalid strings
#                (tag and string are separated by OFS).
#
# At the end, the number of lines processed and the number of items
# found for each tag are written to "/dev/stderr".

BEGIN {
  # Exit status requested by prog_error(); negative means "no error".
  abort = -1;

  usage = ( \
    "cat INFILE.txt \\\n" \
    " | identify-viqr-words \\\n" \
    " [ -v valid=BOOL ] [ -v punctuation=BOOL ] [ -v invalid=BOOL ] \\\n" \
    " [ -v tagged=BOOL ] \\\n" \
    " > OUTFILE.wds" \
  );

  # One VIQR vowel at the start of the string: a base letter
  # (A/a, O/o, U/u, E/e, optionally followed by one of the modifier
  # characters allowed for that letter, or a plain I/i/Y/y),
  # optionally followed by one tone mark from the set ` ' . ? ~
  pat_vowel = "^(([Aa][\\(\\^]?)|([Oo][\\+\\^]?)|([Uu][\\+]?)|([Ee][\\^]?)|[IiYy])[`'.\\?~]?";

  # One ASCII consonant letter at the start of the string.
  pat_cons = "^[b-df-hj-np-tvwxzB-DF-HJ-NP-TVWXZ]";

  # One punctuation sign at the start of the string: "--", "...",
  # or a single character from the bracketed set.
  pat_punct = "^([-][-]|[.][.][.]|[!\\?:;,.()«»])";

  # Options that were not given on the command line default to false.
  if (valid == "") { valid = 0; }
  if (punctuation == "") { punctuation = 0; }
  if (invalid == "") { invalid = 0; }
  if (tagged == "") { tagged = 0; }

  nlin = 0;          # Non-blank lines read.
  split("", count);  # Parsed words, accepted or rejected; indexed by tag.
}

# Skip blank lines and comment lines.  Leading tabs are accepted here
# as well as leading blanks, to agree with the whitespace handling of
# the main rule below (otherwise tab-only lines would be counted as
# non-blank, and tab-indented comments would be parsed as text).
/^[ \011]*([#]|$)/ { next; }

# Main rule: break the line into blank-delimited chunks, then peel
# valid words, punctuation signs and invalid strings off the front of
# each chunk until the chunk is exhausted.
{
  lin = ($0 " ");
  nlin++;
  gsub(/^[ \011]+/, "", lin);
  gsub(/[ \011][ \011]+/, " ", lin);
  nw = split(lin, wds);
  for (i = 1; i <= nw; i++)
    {
      wd = wds[i];
      while (wd != "")
        {
          if (match(wd, pat_vowel) || match(wd, pat_cons))
            {
              # Valid word: a maximal run of vowels and consonants.
              # A single "-" is kept inside the word only when it is
              # immediately followed by an ASCII letter.
              tag = "+"; ok = 1; res = ""; out = valid;
              while (ok && (wd != ""))
                {
                  if (match(wd, pat_vowel) || match(wd, pat_cons))
                    {
                      # Guard against an empty match, which would
                      # make this loop spin forever.
                      if (RLENGTH == 0) { prog_error("RLENGTH == 0"); }
                      res = (res substr(wd, 1, RLENGTH));
                      wd = substr(wd, RLENGTH + 1);
                    }
                  else if (match(wd, /^[-][A-Za-z]/))
                    {
                      # Consume only the hyphen; the letter after it
                      # is picked up by the next iteration.
                      res = (res "-");
                      wd = substr(wd, 2);
                    }
                  else
                    { ok = 0; }
                }
            }
          else if (match(wd, pat_punct))
            {
              # Punctuation: exactly one sign per output item.
              tag = "."; out = punctuation;
              if (RLENGTH == 0) { prog_error("RLENGTH == 0"); }
              res = substr(wd, 1, RLENGTH);
              wd = substr(wd, RLENGTH + 1);
            }
          else
            {
              # Invalid string: consume characters one at a time until
              # the remainder starts with a vowel, a consonant or a
              # punctuation sign.  The first character is always
              # consumed, since none of the patterns matched above.
              tag = "-"; ok = 1; res = ""; out = invalid;
              while (ok && (wd != ""))
                {
                  if (! (match(wd, pat_vowel) || match(wd, pat_cons) || match(wd, pat_punct)))
                    {
                      res = (res substr(wd, 1, 1));
                      wd = substr(wd, 2);
                    }
                  else
                    { ok = 0; }
                }
            }

          # Every pass must have removed something from "wd".
          if (res == "") { prog_error(("res empty for word \"" wds[i] "\"")); }

          if (out)
            { if (tagged) { print tag, res; } else { print res; } }

          # Items are counted whether or not they are printed.
          count[tag]++;
        }
    }
}

END {
  # "exit" inside prog_error() still runs this rule; propagate the
  # requested status and skip the summary in that case.
  if (abort >= 0) { exit abort; }
  printf "non-blank lines %7d\n", nlin > "/dev/stderr";
  for (tag in count)
    { printf "%s %7d\n", tag, count[tag] > "/dev/stderr"; }
}

# Reports an internal error: writes the current input line number
# (FNR) and "msg" to "/dev/stderr", records exit status 1 in "abort",
# and terminates the program.
function prog_error(msg)
{
  printf "line %d: %s\n", FNR, msg > "/dev/stderr";
  abort = 1;
  exit 1;
}