#! /usr/bin/gawk -f
# Last edited on 2022-06-05 21:58:04 by stolfi

BEGIN {
  usage = ( \
    "cat INFILE \\\n" \
    " | add_date_name_to_urls.gawk \\\n" \
    " -v prefix=URLPREFIX \\\n" \
    " -v table=TBLFILE \\\n" \
    " > OUTFILE " \
  );

  # Reads an HTML file from stdin that contains lines with format
  #   "{SECURL} @ {COMMENT}"
  # where {SECURL} is the URL of a letter sent to the SEC,
  # and {COMMENT} is an arbitrary text.

  # Converts each of those lines to a HTML table line showing the date
  # and sender name of the letter, with a HTML link to the {SECURL},
  # and the {COMMENT}, suitably formatted.

  # Other lines are left unchanged.

  # Every {SECURL} in the file should start with the given {URLPREFIX}.

  # Each line of the {TBLFILE} should have three fields {SECNUM} {DATE}
  # {NAME} where {SECNUM} is the URL of the letter minus the
  # {URLPREFIX}, {DATE} is the ISO date of the letter, and {NAME} is
  # the sender's name with blanks replaced by "_".  Comments starting
  # with "#" and blank lines are ignored in {TBLFILE}.

  # Whenever an input {SECURL} is not found in the table, the
  # strings "???-??-??" and "???" are substituted for the date and name.

  # In any case, input lines that are blank or begin with "#" are not
  # changed.

  # A negative {abort} means "no error pending"; the rules below exit
  # with status {abort} once it becomes non-negative.  Presumably it is
  # set by the error helpers, which are defined outside this section.
  abort = -1;

  if (table == "") { arg_error("must specify \"-v table=TBLFILE\"\n"); }
  if (prefix == "") { arg_error("must specify \"-v prefix=URLPREFIX\"\n"); }

  # Lookup tables, both indexed by {SECNUM}: letter date and sender name.
  split("", tbdate);
  split("", tbname);
  read_table(table, tbdate, tbname);
}

# Stop as soon as an error status has been recorded:
(abort >= 0) { exit abort; }

# Comment lines and blank lines are copied through unchanged:
/^[#]/ { print; next; }

/^ *$/ { print; next; }

# Any line containing "@" is treated as a "{SECURL} @ {COMMENT}" entry:
/[@]/ {
  if (abort >= 0) { exit abort; }
  if (NF < 3) { data_error("not enough input fields\n"); }
  # This line comes from the input file, not from the table file:
  if ($2 != "@") { data_error("malformed input line\n"); }

  # Get the URL of the letter {url}:
  url = $1;

  # Get the comment (but preserving $0).  Blanking the first two fields
  # makes awk rebuild $0 from the remaining fields joined by {OFS}:
  lin = $0;
  $2 = ""; $1 = "";
  # Trace of the extracted comment, sent to stderr:
  printf " [[%s]]\n", $0 > "/dev/stderr"
  cm = $0;
  $0 = lin;

  # Split the prefix {pr} from the URL leaving {sn}:
  pr = substr(url, 1, length(prefix));
  sn = substr(url, length(prefix) + 1);
  if (pr != prefix) {
    # The failure here is a prefix mismatch, not a missing table key:
    data_error(("URL \"" url "\" does not start with prefix \"" prefix "\"\n"));
  }

  # Look up {sn} in table:
  if (sn in tbdate) {
    dt = tbdate[sn];
    na = tbname[sn];
  } else {
    dt = "???-??-??";
    na = "???";
  }
  printout(sn, dt, na, cm);
  next;
}

# Everything else is copied through unchanged:
// { print; next; }

function read_table(fname,tbdate,tbname,   ntbl,nlin,lin,linx,fld,nfld,res)
{
  # Reads the table file {fname} and stores its entries in the arrays
  # {tbdate} and {tbname}, both indexed by the first field of each line
  # (second field -> {tbdate}, third field -> {tbname}).
  #
  # Lines that are blank or start with "#" (after optional blanks or
  # tabs) are skipped; a trailing "#" comment is stripped from the other
  # lines.  Reports via {tbl_error} any entry that does not have exactly
  # three fields or that repeats a key, and any read error; reports via
  # {arg_error} a file from which no line could be read.  Writes a trace
  # of each line read and a final entry count to stderr.

  ntbl = 0;
  nlin = 0;
  # Keep the {getline} status in {res} so that a read or open failure
  # (-1) can be told apart from a normal end of file (0):
  while ((res = (getline lin < fname)) > 0) {
    # Trace of the table line, sent to stderr:
    printf " > %s\n", lin > "/dev/stderr"
    nlin++;
    if (! match(lin, /^[ \011]*([#]|$)/)) {
      # Strip any trailing comment before splitting into fields:
      linx = lin;
      gsub(/ *[#].*$/, "", linx);
      nfld = split(linx, fld, " ");
      if (nfld != 3) {
        tbl_error(fname, nlin, ("bad table entry = \"" lin "\""));
      }
      if (fld[1] in tbdate) {
        tbl_error(fname, nlin, ("repeated key = \"" lin "\""));
      }
      tbdate[fld[1]] = fld[2];
      tbname[fld[1]] = fld[3];
      ntbl++;
    }
  }
  # Rely on the {getline} status rather than on a possibly stale {ERRNO}:
  if (res < 0) { tbl_error(fname, nlin, ERRNO); }
  close(fname);
  if (nlin == 0) { arg_error(("file \"" fname "\" empty or missing")); }
  printf "loaded %6d table entries\n", ntbl > "/dev/stderr"
}

function printout(sn,dt,na,cm, tx,url)
{
  # prints the HTML table entry for letter number {sn}, date {dt}, name {nm}, comment {cm}.
  tx = gensub(/["]([^"]+)["]/, "\\1", "g", cm)
  url = (prefix sn)
  printf "