#! /usr/bin/gawk -f
# Last edited on 2001-01-02 01:06:42 by stolfi

# Reads EVMT text from standard input, selects text units of given type(s).
#
#    cat INFILE \
#      | select-units \
#          -v types='TYPE1,TYPE2,...,TYPEN' \
#          -v table=TBLFILE \
#      > OUTFILE
#
# Each line of TBLFILE should have two words, UNIT and TYPE, specifying
# the text type of each text unit that may occur in the INFILE.

# === ACTIONS ===================================================

BEGIN {
  # Setup: validates the command-line variables and loads the lookup tables.
  #
  # Inputs (set with "-v"):
  #   types   list of selected type names, separated by commas and/or blanks.
  #   table   name of a file with one "UNIT TYPE" pair per line; lines
  #           starting with "#" are ignored.
  #
  # Outputs (global):
  #   good[t]    is 1 for each selected type "t".
  #   utype[u]   is the type assigned to unit "u" by the table file.
  #   nMap       number of pairs loaded from the table file.
  #   abort      exit status requested by an error, or -1 if none so far.
  abort = -1;

  # Parse the "types" option: set "good[t] = 1" for each selected type "t".
  if (types == "") arg_error("must specify \"-v types='TYPE1,TYPE2,...,TYPEN'\"\n");
  n = split(types,tp,/[ ,]+/);
  split("", good);
  for (i=1;i<=n;i++) { good[tp[i]] = 1; }

  # Read the unit-to-type table into "utype[u]".
  if (table == "") arg_error("must specify \"-v table=FILE\"\n");
  split("", utype);
  nMap=0;
  # Keep the "getline" status so that a read error (-1) can be told apart
  # from a normal end of file (0) without relying on the idle value of ERRNO.
  while((rdstat = (getline lin < table)) > 0) {
    if (! match(lin, /^[#]/))
      { nfld = split(lin, fld, " ");
        if (nfld != 2) arg_error(("bad table entry = \"" lin "\""));
        # Each unit may be given a type only once.
        if (fld[1] in utype) arg_error(("repeated key = \"" lin "\""));
        utype[fld[1]] = fld[2];
        nMap++;
      }
  }
  if (rdstat < 0) { arg_error((table ": " ERRNO)); }
  close (table);
  if (nMap == 0) { arg_error(("file \"" table "\" empty or missing")); }
  # printf "loaded %6d table pairs\n", nMap > "/dev/stderr"
}

# Once an error has set "abort" to a non-negative status, stop with it.
abort >= 0 {
  exit abort;
}

# Lines starting with "#" are dropped from the output.
$0 ~ /^#/ {
  next;
}

# A line consisting only of a "<...>" locator without ";" followed by one
# "{...}" group is copied to the output unconditionally.
$0 ~ /^<[^<>;]*> *[{][^{}]*[}] *$/ {
  print $0;
  next;
}

$0 ~ /^</ {
  # Any other line starting with "<" must begin with a locator of the
  # form "<NAME;X>", where "X" is a single upper-case letter.
  if (match($0, /^<[^<>]*[;][A-Z]>/) == 0) { error("bad locator format"); }

  # Take the text between "<" and ";", then strip a trailing ".NUM" or
  # ".NUMx" suffix to obtain the unit name used as key in "utype".
  unit = substr($0, 2, RLENGTH-4);
  sub(/[.][0-9]+[a-z]?$/, "", unit);

  if (unit in utype) {
    # Output the line only when the unit's type is one of the selected ones.
    if (utype[unit] in good) { print $0; }
  } else {
    error(("unit \"" unit "\" not in table"));
  }
  next;
}
      
# Any non-empty line not handled by the rules above is malformed.
$0 ~ /./ {
  error("bad line format");
}

# Reports a command-line or table-file problem: writes "msg" and a newline
# to standard error, records status 1 in "abort", and exits with status 1.
function arg_error(msg)
{
  abort = 1;
  printf("%s\n", msg) >> "/dev/stderr";
  exit abort;
}

# Reports a problem in the current input record: writes "msg" prefixed by
# the record number NR to standard error, records status 1 in "abort",
# and exits with status 1.
function error(msg)
{
  abort = 1;
  printf("line %d: %s\n", NR, msg) >> "/dev/stderr";
  exit abort;
}