#! /bin/gawk -f
# Last edited on 2002-01-04 10:46:29 by stolfi

BEGIN {
  # Exit status recorded by prog_error; stays negative while no error occurred.
  abort = -1;

  # Command-line synopsis (kept for reference; nothing in this file prints it).
  usage = ( \
    "cat INFILE.txt \\\n" \
    "  | identify-viqr-words \\\n" \
    "     [ -v valid=BOOL ] [ -v punctuation=BOOL ] [ -v invalid=BOOL ] \\\n" \
    "     [ -v tagged=BOOL ] \\\n" \
    "  > OUTFILE.wds" \
  );

  # The input is broken into Vietnamese (VIQR) words, punctuation signs and
  # whatever other strings lie between them; each piece is written on a line
  # of its own. Blanks and line breaks separate words.
  #
  # Three switches select which kinds of piece are written:
  #   valid        valid VIQR words
  #   punctuation  punctuation signs
  #   invalid      strings that are neither of the above
  #
  # With "tagged" set, each output line is prefixed by a one-character tag:
  # "+" for valid words, "." for punctuation, "-" for invalid strings.

  # Unset switches default to off.
  if (tagged == "")      tagged = 0;
  if (invalid == "")     invalid = 0;
  if (punctuation == "") punctuation = 0;
  if (valid == "")       valid = 0;

  # Recognizers for one unit at the START of a string (all anchored with "^"):
  #   pat_cons   a single consonant letter;
  #   pat_vowel  a vowel letter, an optional modifier character allowed for
  #              that letter, then an optional trailing mark from ` ' . ? ~ ;
  #   pat_punct  "--", "...", or one of the single punctuation characters.
  pat_cons = "^[b-df-hj-np-tvwxzB-DF-HJ-NP-TVWXZ]";
  pat_vowel = "^(([Aa][\\(\\^]?)|([Oo][\\+\\^]?)|([Uu][\\+]?)|([Ee][\\^]?)|[IiYy])[`'.\\?~]?";
  pat_punct = "^([-][-]|[.][.][.]|[!\\?:;,.()«»])";

  # Statistics reported at the end of the run.
  split("", count); # Pieces parsed (whether printed or not), keyed by tag.
  nlin = 0;         # Lines processed by the tokenizing rule.
}

# Skip blank lines and comment lines (first non-blank character is "#").
# TAB (\011) counts as a blank here, exactly as it does when the tokenizing
# rule splits a line into words; otherwise a tab-only line would be tallied
# as a non-blank line and a tab-indented comment would be parsed as text.
/^[ \011]*([#]|$)/ { next; }

# Tokenizing rule, applied to every record that reaches it.
#
# The record is split on blanks; each blank-delimited chunk is then consumed
# from the left, one piece per pass, where a piece is
#   "+"  a maximal run of vowel/consonant units, with single hyphens allowed
#        between letters;
#   "."  one punctuation sign (as defined by pat_punct);
#   "-"  a maximal run of characters at which none of the three patterns match.
# Every piece is tallied in count[tag]; it is printed only when the switch
# for its kind is on, preceded by the tag when "tagged" is on.
{
  nlin++;
  lin = ($0 " ");
  gsub(/^[ \011]+/, "", lin);
  gsub(/[ \011][ \011]+/, " ", lin);
  nw = split(lin, wds);

  for (i = 1; i <= nw; i++) {
    wd = wds[i];

    # Each pass removes exactly one piece from the front of wd.
    while (wd != "") {
      res = "";

      if (match(wd, pat_vowel) || match(wd, pat_cons)) {
        # Valid word: keep absorbing letter units and letter-joining hyphens.
        tag = "+";
        out = valid;
        while (wd != "") {
          if (match(wd, pat_vowel) || match(wd, pat_cons)) {
            # RLENGTH belongs to whichever of the two patterns matched.
            if (RLENGTH == 0) { prog_error("RLENGTH == 0"); }
            res = (res substr(wd, 1, RLENGTH));
            wd = substr(wd, RLENGTH + 1);
          } else if (match(wd, /^[-][A-Za-z]/)) {
            # A hyphen is absorbed only when a letter follows it.
            res = (res "-");
            wd = substr(wd, 2);
          } else {
            break;
          }
        }
      } else if (match(wd, pat_punct)) {
        # Punctuation: exactly one sign per piece.
        tag = ".";
        out = punctuation;
        if (RLENGTH == 0) { prog_error("RLENGTH == 0"); }
        res = substr(wd, 1, RLENGTH);
        wd = substr(wd, RLENGTH + 1);
      } else {
        # Invalid string: take characters one at a time until a position is
        # reached where a letter unit or a punctuation sign begins.
        tag = "-";
        out = invalid;
        while ((wd != "") && !match(wd, pat_vowel) && !match(wd, pat_cons) && !match(wd, pat_punct)) {
          res = (res substr(wd, 1, 1));
          wd = substr(wd, 2);
        }
      }

      # Every branch must have consumed something, or the loop would not end.
      if (res == "") { prog_error(("res empty for word \"" wds[i] "\"")); }

      if (out) { print (tagged ? (tag OFS res) : res); }
      count[tag]++;
    }
  }
}

END {
  # If prog_error recorded a status, leave with it and print no statistics.
  if (abort >= 0) exit abort;

  # Summary on stderr: the line counter first, then one line per tag seen.
  printf("non-blank lines %7d\n", nlin) > "/dev/stderr";
  for (tag in count)
    printf("%s %7d\n", tag, count[tag]) > "/dev/stderr";
}

# Reports an internal error and terminates the program.
#   msg  text of the diagnostic; it is written to stderr prefixed with the
#        current value of FNR.
# Sets the global "abort" to 1 so that the END rule exits with that status
# instead of printing statistics.
function prog_error(msg)
{
  abort = 1;
  printf("line %d: %s\n", FNR, msg) > "/dev/stderr";
  exit abort;
}