#! /usr/bin/gawk -f
# Last edited on 2025-05-03 06:09:30 by stolfi
# Reads EVMT text from standard input, selects text units of given type(s).
#
# cat INFILE \
# | ./select_units,gawk \
# -v types='TYPE1,TYPE2,...,TYPEN' \
# -v table=TBLFILE \
# > OUTFILE
#
# Each line of TBLFILE should have two words, UNIT and TYPE, specifying
# the text type of each text unit that may occur in INFILE.
# Lines of TBLFILE that start with "#" are ignored.
# === ACTIONS ===================================================
BEGIN {
  # Startup: validates the "types" and "table" options and loads the
  # unit-to-type table before any input line is read.

  # "abort" stays negative until arg_error() or error() records a failure;
  # the first main rule exits with its value once it is non-negative.
  abort = -1;

  # Parse "types" option, set up "good[t] = 1" for each selected type "t".
  # Types may be separated by commas and/or blanks.
  if (types == "") arg_error("must specify \"-v types='TYPE1,TYPE2,...,TYPEN'\"\n");
  n = split(types,tp,/[ ,]+/);
  split("", good);
  for (i=1;i<=n;i++) { good[tp[i]] = 1; }

  # Read the unit-to-type table into "utype[u]".
  if (table == "") arg_error("must specify \"-v table=FILE\"\n");
  # printf "table = %s\n", table > "/dev/stderr"
  split("", utype);
  nMap=0;
  ERRNO = ""
  while((getline lin < table) > 0) {
    # Lines starting with "#" are comments.
    if (! match(lin, /^[#]/))
      { nfld = split(lin, fld, " ");
        if (nfld != 2) arg_error(("bad table entry = \"" lin "\""));
        # A unit may be listed only once; look it up in "utype", the
        # array that is being filled.
        if (fld[1] in utype) arg_error(("repeated key = \"" lin "\""));
        utype[fld[1]] = fld[2];
        nMap++;
      }
  }
  # ERRNO was cleared before the loop, so a non-empty value here means
  # that gawk reported a failure while reading the table.
  if (ERRNO != "") { arg_error((table ": " ERRNO)); }
  close (table);
  if (nMap == 0) { arg_error(("file \"" table "\" empty or missing")); }
  printf " loaded %6d table pairs\n", nMap > "/dev/stderr"
}
# Stop at once if an earlier step already recorded a failure status.
abort >= 0 { exit abort; }

# Classify each input line and print the ones that are selected.
{
  # Comment lines are dropped.
  if ($0 ~ /^#/) { next; }

  # A locator without ";" followed only by a "{...}" group is copied as is.
  if ($0 ~ /^<[^<>;]*> *[{][^{}]*[}] *$/) { print; next; }

  # Any other line starting with "<" must carry a "<UNIT;X>" locator.
  if ($0 ~ /^</) {
    if (! match($0, /^<[^<>]*[;][A-Z]>/)) { error("bad locator format"); }

    # Text between "<" and ";": the match minus "<", ";", the letter, and ">".
    loc = substr($0, 2, RLENGTH-4);
    # printf " loc = %s\n", loc > "/dev/stderr"

    # Drop a trailing ".NUMBER" with optional lowercase letter to get the
    # key used in the table.
    unit = loc;
    sub(/[.][0-9]+[a-z]?$/, "", unit);
    # printf " unit = %s\n", unit > "/dev/stderr"

    if (! (unit in utype)) { error(("unit \"" unit "\" not in table")); }

    # Keep the line only when the unit's type is one of the requested types.
    if (utype[unit] in good) { print; }
    next;
  }

  # Blank lines fall through silently; anything else is malformed.
  if ($0 ~ /./) { error("bad line format"); }
}
# Reports a problem with the command-line options or the table file:
# writes "msg" to standard error, records exit status 1 in the global
# "abort", and terminates the program with that status.
function arg_error(msg)
{
  abort = 1;
  printf "%s\n", msg >> "/dev/stderr";
  exit abort
}
# Reports a problem with the current input line: writes "msg" to standard
# error prefixed by the record number NR, records exit status 1 in the
# global "abort", and terminates the program with that status.
function error(msg)
{
  abort = 1;
  printf "line %d: %s\n", NR, msg >> "/dev/stderr";
  exit abort
}