#! /usr/bin/gawk -f
# Last edited on 2026-01-16 11:37:07 by stolfi

# Reads EVMT text from standard input, selects text units of specified type(s).
#
#   cat INFILE \
#     | ./select_units,gawk \
#         -v types='TYPE1,TYPE2,...,TYPEN' \
#         -v table=TBLFILE \
#     > OUTFILE
#
# The {TBLFILE} should have one line with two words "{UNIT} {TYPE}" for
# each text {UNIT} that may occur in the {INFILE}.  Lines of {TBLFILE}
# that start with "#" are ignored.
#
# A {UNIT} is the old-style locator minus the line number and
# transcriber code, like "f85v2.P2". The {TYPE} is a string of nonblanks
# specifying the type of that unit, like "parags" or "labels", in some
# arbitrary classification scheme.
#
# Output: "#"-comment lines are dropped; lines consisting of a locator
# without transcriber code followed by a "{...}" comment are copied
# unconditionally; text lines are copied only if their unit's type is
# one of the requested {TYPE}s.  Any other non-empty line, a unit
# missing from the table, or a malformed table is a fatal error
# (message on stderr, exit status 1).

# === ACTIONS ===================================================

BEGIN {
  # "abort" stays negative until an error routine sets it; see the guard rule below.
  abort = -1;

  # Parse "types" option, set up "good[t] = 1" for each selected text type "t".
  if (types == "") arg_error("must specify \"-v types='TYPE1,TYPE2,...,TYPEN'\"\n");
  n = split(types, tps, /[ ,]+/);
  split("", good);
  for (i = 1; i <= n; i++) { good[tps[i]] = 1; }

  # Read the unit-to-type table, "utype[u]"
  if (table == "") arg_error("must specify \"-v table=FILE\"\n");
  # printf "table = %s\n", table > "/dev/stderr"
  split("", utype);
  nMap = 0;
  ERRNO = "";
  while ((getline lin < table) > 0) {
    if (! match(lin, /^[#]/)) {
      nfld = split(lin, fld, " ");
      if (nfld != 2) arg_error(("bad table entry = \"" lin "\""));
      # Duplicate detection must look in "utype", the array actually being
      # filled (the original tested a never-assigned array, so repeated
      # keys were silently accepted and the last one won).
      if (fld[1] in utype) arg_error(("repeated key = \"" lin "\""));
      utype[fld[1]] = fld[2];
      nMap++;
    }
  }
  # A failed "getline" (e.g. unreadable file) ends the loop too; gawk
  # leaves the reason in ERRNO.
  if (ERRNO != "") { arg_error((table ": " ERRNO)); }
  close(table);
  if (nMap == 0) { arg_error(("file \"" table "\" empty or missing")); }
  printf "    loaded %6d table pairs\n", nMap > "/dev/stderr"
}

# Safety net: if an error was flagged, stop at the first input record.
(abort >= 0) { exit abort; }

# Drop "#" comment lines.
/^#/ { next; }

# Locator without a ";" (no transcriber code) followed only by a
# "{...}" comment: copy through regardless of unit type.
/^<[^<>;]*> *[{][^{}]*[}] *$/ { print; next; }

# Text line with a full locus ID "<UNIT.LINE;T>".
# NOTE(review): the original selector pattern of this rule was lost to
# markup stripping in the stored source; "/^</" is a reconstruction that
# hands every remaining "<"-initial line to the format check below --
# confirm against the pristine script.
/^</ {
  if (! match($0, /^<[^<>]*[;][A-Z]>/)) { error("bad locus ID format"); }
  # Strip the leading "<" and the trailing ";T>" (4 characters in all).
  un = substr($0, 2, RLENGTH-4);
  # printf "  un = %s\n", un > "/dev/stderr"
  # Remove the trailing line number (".NN" plus optional letter) to get the unit.
  ustrip = un;
  gsub(/[.][0-9]+[a-z]?$/, "", ustrip);
  # printf "  ustrip = %s\n", ustrip > "/dev/stderr"
  if (! (ustrip in utype)) { error(("unit \"" ustrip "\" not in table")); }
  # "in" tests membership without adding an entry to "good".
  if (utype[ustrip] in good) { print; }
  next;
}

# Anything else that is not an empty line is an error.
/./ { error("bad line format"); }

# Reports a command-line/table problem {msg} on stderr and exits with status 1.
function arg_error(msg)
{
  printf "%s\n", msg >> "/dev/stderr";
  abort = 1;
  exit 1
}

# Reports a problem {msg} with the current input line (NR) on stderr
# and exits with status 1.
function error(msg)
{
  printf "line %d: %s\n", NR, msg >> "/dev/stderr";
  abort = 1;
  exit 1
}