#! /usr/bin/gawk -f
# Last edited on 2025-05-09 09:27:36 by stolfi

# Reads each line from stdin, maps a selected field through a table,
# and inserts the result as a specified new field.
#
# The {inField} and {outField} parameters are "1" if not specified,
# i.e. the new field is prepended to the original record.
#
# Each line of {TBLFILE} should have two blank-separated words
# "{OLD} {NEW}" specifying the mapping from {OLD} field values to
# {NEW} field values.  If {inverse} is set, the order of fields in
# table lines is taken to be "{NEW} {OLD}" instead.  "#"-comments and
# empty lines in the table file are ignored.
#
# Whenever an input field is not found in the table: if {forgiving} is
# set, the field value itself is used; otherwise the {defSubst} string
# is used if not empty; otherwise the script aborts with an error
# message.
#
# In any case, input lines that are blank or begin with "#" are copied
# to the output unchanged.

BEGIN {
  usage = ( \
    "cat INFILE \\\n" \
    " | map_field.gawk \\\n" \
    " [ -v inField=FLDNUM ] \\\n" \
    " [ -v outField=FLDNUM ] \\\n" \
    " -v table=TBLFILE [ -v inverse=BOOL ] \\\n" \
    " [ -v defSubst=DEFSTRING | -v forgiving=BOOL ] \\\n" \
    " > OUTFILE " \
  );

  # {abort} is negative while everything is fine; the error functions
  # set it to the desired exit status before calling {exit}.
  abort = -1;

  # Validate the command-line variables and fill in the defaults.
  if (table == "") { arg_error("must specify \"-v table=FILE\"\n"); }
  if (inverse == "") { inverse = 0; }
  if (inField == "") { inField = 1; }
  if (outField == "") { outField = 1; }
  if ((forgiving != "") && (defSubst != "")) {
    arg_error("can't specify \"defSubst\" with \"forgiving\"");
  } else if ((forgiving == "") && (defSubst == "")) {
    forgiving = 0;
  }

  # Start from an empty dictionary and load the mapping table into it.
  split("", dic);
  read_table(table, inverse, dic);
}

# Guard: if an error was flagged earlier, stop with that status.
(abort >= 0) { exit abort; }

# Comment lines (with "#" in the first column) are copied unchanged.
/^[#]/ { print; next; }

# Blank lines (empty, or only spaces and/or tabs) are copied unchanged.
# Tabs are accepted here so that a tab-only line is not mistaken for a
# data line with too few fields.
/^[ \011]*$/ { print; next; }

# Data lines: map field {inField} through {dic} and insert the result.
/./ {
  if (abort >= 0) { exit abort; }
  if (NF < inField) { data_error("not enough input fields\n"); }
  x = $(inField);
  if (x in dic) {
    y = dic[x];
  } else if (forgiving) {
    # Unknown key, forgiving mode: the key maps to itself.
    y = x;
  } else if (defSubst != "") {
    # Unknown key, default substitution requested.
    y = defSubst;
  } else {
    data_error(("key \"" x "\" not in table\n"));
  }
  printout(y, outField);
  next;
}

function read_table(fname, inv, tbl,    ntbl, nlin, lin, fld, nfld, tmp, rc) {
  # Reads the mapping pairs from file {fname} into the array {tbl},
  # so that {tbl[OLD] = NEW}.  If {inv} is true, each table line is
  # read as "NEW OLD" instead of "OLD NEW".  Blank lines and lines
  # whose first non-blank character is "#" are skipped; a third field
  # beginning with "#" is treated as a trailing comment.  Aborts via
  # {tbl_error} on a malformed line, a repeated key, or a read error,
  # and via {arg_error} if no lines at all could be read.
  ntbl = 0;
  nlin = 0;
  while ((rc = (getline lin < fname)) > 0) {
    nlin++;
    if (! match(lin, /^[ \011]*([#]|$)/)) {
      nfld = split(lin, fld, " ");
      # Tolerate a trailing "#"-comment after the two data fields:
      if ((nfld >= 3) && (fld[3] ~ /^[#]/)) { nfld = 2; }
      if (nfld != 2) {
        tbl_error(fname, nlin, ("bad table entry = \"" lin "\""));
      }
      # If {inv} is true, swap the two columns:
      if (inv) { tmp = fld[1]; fld[1] = fld[2]; fld[2] = tmp; }
      if (fld[1] in tbl) {
        tbl_error(fname, nlin, ("repeated key = \"" lin "\""));
      }
      tbl[fld[1]] = fld[2];
      ntbl++;
    }
  }
  # A negative {getline} result means the file could not be opened or
  # read; in gawk, {ERRNO} then describes the failure.
  if (rc < 0) {
    tbl_error(fname, nlin, (ERRNO != "" ? ERRNO : "read error"));
  }
  close(fname);
  if (nlin == 0) { arg_error(("file \"" fname "\" empty or missing")); }
  # printf "loaded %6d map pairs\n", ntbl > "/dev/stderr"
}

function printout(mw, fn,    i) {
  # Prints $0 with {mw} inserted as field {$(fn)}.  The record must
  # have at least {fn-1} fields.
  if (NF < fn-1) { data_error("not enough output fields\n"); }
  if (fn == 1) {
    print mw, $0;
  } else if (fn == NF+1) {
    print $0, mw;
  } else {
    # NOTE(review): the text of this branch was lost in the received
    # copy of the file (everything after "for (i=1;i" was missing).
    # It is reconstructed here from the header comment above: fields
    # 1..fn-1, then {mw}, then fields fn..NF, joined by OFS.
    # Confirm against the original source.
    for (i = 1; i < fn; i++) { printf "%s%s", $(i), OFS; }
    printf "%s", mw;
    for (i = fn; i <= NF; i++) { printf "%s%s", OFS, $(i); }
    printf "%s", ORS;
  }
}

function arg_error(msg) {
  # Reports a command-line error {msg} plus the usage text to stderr
  # and exits with status 1.
  # NOTE(review): the function header and the format of the first
  # printf were lost in the received copy; "%s\n" is assumed - confirm
  # against the original source.
  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit 1
}

function tbl_error(f, n, msg) {
  # Reports error {msg} at line {n} of table file {f} to stderr and
  # exits with status 1.
  printf "%s:%d: %s\n", f, n, msg > "/dev/stderr";
  abort = 1;
  exit 1
}

function data_error(msg) {
  # Reports error {msg} at the current input file and line to stderr
  # and exits with status 1.
  printf "%s:%d: %s\n", FILENAME, FNR, msg > "/dev/stderr";
  abort = 1;
  exit 1
}