#! /usr/bin/gawk -f
# Last edited on 2022-06-15 21:00:57 by stolfi
BEGIN {
usage = ( \
"cat INFILE \\\n" \
" | add_date_name_to_urls.gawk \\\n" \
" -v prefix=URLPREFIX \\\n" \
" -v table=TBLFILE \\\n" \
" > OUTFILE " \
);
# Reads an HTML file from stdin that contains lines with the format
# "{SECURL} @ {COMMENT}"
# where {SECURL} is the URL of a letter sent to the SEC and {COMMENT} is arbitrary text.
# Converts each such line into an HTML table row showing the date
# and sender name of the letter, with an HTML link to the {SECURL}, and
# the {COMMENT}, suitably formatted. Each row is preceded by an
# HTML comment with the date and the letter number, for use as a sort
# key. The rows of each table are then sorted by that key.
# Other lines are left unchanged.
# Every {SECURL} in the file should start with the given {URLPREFIX}.
# Each line of the {TBLFILE} should have three fields {NUMEX} {DATE}
# {NAME}, where {NUMEX} is the URL of the letter minus the {URLPREFIX}
# but including the extension (".htm" or ".pdf"), {DATE} is the ISO date of
# the letter, and {NAME} is the sender's name with blanks replaced by
# "_". Comment lines starting with "#" and blank lines in {TBLFILE}
# are ignored.
# Whenever an input {SECURL} is not found in the table, the
# strings "???-??-??" and "???" are substituted for the date and name.
# In any case, input lines that are blank or begin with "#" are not
# changed.
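# For illustration only (hypothetical prefix, letter key, and names, not taken
# from a real run): with URLPREFIX "https://www.sec.gov/comments/", a {TBLFILE} line
#   s70820-45.htm  2007-08-20  John_Doe
# and an input line
#   https://www.sec.gov/comments/s70820-45.htm @ Opposes the proposed rule.
# would yield one table row that links "2007-08-20" and "John_Doe" to that URL,
# followed by the comment text, and preceded by a sort-key HTML comment.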
abort = -1;
if (table == "") { arg_error("must specify \"-v table=TBLFILE\"\n"); }
if (prefix == "") { arg_error("must specify \"-v prefix=URLPREFIX\"\n"); }
# URL to date and name table:
split("", tbdate); # Indexed with the {NUMEX}.
split("", tbname); # Indexed with the {NUMEX}.
read_table(table,tbdate,tbname);
# Lines of a section to be sorted:
section_table_reset()
}
(abort >= 0) { exit abort; }
/^[#]/ {
print;
next;
}
/^ *$/ {
# Ignore blank lines inside a "<table>...</table>" section:
if (nsecrow < 0) { print; }
next;
}
/<table>/ {
# Start of a new section table:
if (nsecrow >= 0) { data_error("missing '</table>'\n"); }
section_table_clear();
next;
}
/[@]/ {
if (abort >= 0) { exit abort; }
if (NF < 3) { data_error("not enough input fields\n"); }
if ($2 != "@") { data_error("malformed table line\n"); }
if (nsecrow < 0) { data_error("missing '<table>'\n"); }
# Get the URL of the letter {url}:
url = $1;
# Get the comment {cm}, while preserving {$0}:
lin = $0;
$2 = ""; $1 = "";
printf " [[%s]]\n", $0 > "/dev/stderr"; # Debug: echo the extracted comment text.
cm = $0;
$0 = lin;
# Split the prefix {pr} from the URL leaving {sn}:
pr = substr(url, 1, length(prefix))
sn = substr(url, length(prefix) + 1)
if (pr != prefix) { data_error(("URL \"" url "\" does not start with the prefix \"" prefix "\"\n")); }
# Look up {sn} in table:
if (sn in tbdate) {
dt = tbdate[sn]; na = tbname[sn]
} else {
dt = "???-??-??"; na = "???";
}
section_table_save_row(sn, dt, na, cm);
next;
}
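# For illustration (hypothetical URL, not from a real run): with
# prefix = "https://www.sec.gov/comments/" and
# url = "https://www.sec.gov/comments/s70820-99.pdf", the rule above looks up
# sn = "s70820-99.pdf" in {tbdate}/{tbname}; if it is absent, the row is still
# emitted, with "???-??-??" as the date and "???" as the name.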
/<[\/]table>/ {
# End of section table:
if (nsecrow < 0) { data_error("missing '<table>'\n"); }
section_table_dump();
section_table_reset();
next;
}
# All other lines are copied to the output unchanged:
// { print; next; }
END {
if (nsecrow >= 0) { data_error("missing '</table>'\n"); }
}
function read_table(fname,tbdate,tbname, ntbl,nlin,lin,linx,fld,nfld,tmp) {
# Reads the {NUMEX} {DATE} {NAME} table from file {fname} into
# {tbdate[NUMEX] = DATE} and {tbname[NUMEX] = NAME}.
# Blank lines and "#"-comments in the file are ignored.
ntbl=0;
nlin=0;
while((getline lin < fname) > 0) {
printf " > %s\n", lin > "/dev/stderr"; # Debug: echo each table line as it is read.
nlin++;
if (! match(lin, /^[ \011]*([#]|$)/))
{ linx = lin; gsub(/ *[#].*$/, "", linx);
nfld = split(linx, fld, " ");
if (nfld != 3) { tbl_error(fname, nlin, ("bad table entry = \"" lin "\"")); }
# Check for repeated keys:
if (fld[1] in tbdate) { tbl_error(fname, nlin, ("repeated key = \"" lin "\"")); }
tbdate[fld[1]] = fld[2];
tbname[fld[1]] = fld[3];
ntbl++;
}
}
if ((ERRNO != "0") && (ERRNO != "")) { tbl_error(fname, nlin, ERRNO); }
close (fname);
if (nlin == 0) { arg_error(("file \"" fname "\" empty or missing")); }
printf "loaded %6d table entries\n", ntbl > "/dev/stderr"
}
function section_table_save_row(sn,dt,na,cm, tx,url,bn,key,tbrow) {
# Formats and saves the HTML table row for letter number {sn}, date {dt}, name {na}, comment {cm}.
# Protect 'href' fields in comment:
tx = cm;
tx = gensub(/href=["]([^"]+)["]/, "href=@<<\\1@>>", "g", tx)
# Remove the double quotes around quoted text (quotes inside 'href' attributes are preserved):
tx = gensub(/["]([^"]+)["]/, "\\1", "g", tx)
# Unprotect 'href' fields in comment:
tx = gensub(/href=@<<([^<>]+)@>>/, "href=\"\\1\"", "g", tx)
# Bare number and extension:
bn = sn;
gsub(/^[0-9]+-/, "", bn)
# Sort key: an HTML comment with the date and the bare letter number:
key = ("<!-- " dt " " bn " -->");
# Full letter URL:
url = (prefix sn)
tbrow = sprintf("%-32s<tr><td><a href=\"%s\">%s</a> | <a href=\"%s\">%s</a></td><td>%s</td></tr>", key, url, dt, url, na, tx);
nsecrow++
secrow[nsecrow] = tbrow
}
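# For illustration only (hypothetical arguments, not from a real run): with
# prefix "https://www.sec.gov/comments/", the call
#   section_table_save_row("s70820-45.htm", "2007-08-20", "John_Doe", "Second letter.")
# saves a row consisting of the sort key "<!-- 2007-08-20 s70820-45.htm -->"
# followed by a "<tr>" whose first cell links the date and the name to
# "https://www.sec.gov/comments/s70820-45.htm" and whose second cell holds
# "Second letter.".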
# The lines of each table are temporarily saved in {secrow[1..nsecrow]}.
# If {nsecrow} is negative, we are not parsing a table.
function section_table_reset() {
# Resets the state to 'not inside a table'.
nsecrow = -1
}
function section_table_clear() {
# Initializes the section table to empty.
nsecrow = 0
split("", secrow) # Indexed from 1 to {nsecrow}
}
function section_table_dump( i,ns) {
# Outputs the current section table as "<table>...</table>", with the rows
# sorted by their sort-key comments:
ns = asort(secrow);
if (ns != nsecrow) { prog_error("row count inconsistency"); }
printf "  <table>\n";
for (i = 1; i <= nsecrow; i++) {
printf "    %s\n", secrow[i];
}
printf "  </table>\n";
}
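# For illustration only (hypothetical input, not real SEC data): a section such as
#   <table>
#   https://www.sec.gov/comments/s70820-45.htm @ Opposes the rule.
#   https://www.sec.gov/comments/s70820-12.pdf @ Supports the rule.
#   </table>
# is re-emitted as a "<table>" whose "<tr>" rows carry the linked dates and
# names plus the comments, sorted by the "<!-- {DATE} {NUMBER} -->" key comments.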
function arg_error(msg) {
printf "%s\n", msg > "/dev/stderr";
printf "usage: %s\n", usage > "/dev/stderr";
abort = 1;
exit 1
}
function tbl_error(f,n,msg) {
printf "%s:%d: ** %s\n", f, n, msg > "/dev/stderr";
abort = 1;
exit 1
}
function data_error(msg) {
printf "%s:%d: ** %s\n", FILENAME, FNR, msg > "/dev/stderr";
abort = 1;
exit 1
}
function prog_error(msg) {
printf "%s:%d: ** prog error: %s\n", FILENAME, FNR, msg > "/dev/stderr";
abort = 1;
exit 1
}