#! /usr/bin/gawk -f
# Last edited on 2022-06-15 21:00:57 by stolfi
BEGIN {
usage = ( \
"cat INFILE \\\n" \
" | add_date_name_to_urls.gawk \\\n" \
" -v prefix=URLPREFIX \\\n" \
" -v table=TBLFILE \\\n" \
" > OUTFILE " \
);
# Reads an HTML file from stdin that contains lines with the format
# "{SECURL} @ {COMMENT}"
# where {SECURL} is the URL of a letter sent to the SEC and {COMMENT} is arbitrary text.
# Converts each such line into an HTML table row showing the date
# and sender name of the letter, with an HTML link to the {SECURL}, and
# the {COMMENT}, suitably formatted. Each row is preceded by an
# HTML comment with the date and the letter number, for use as a sort
# key. The rows of each table are then sorted by that key.
# Other lines are left unchanged.
# Every {SECURL} in the file should start with the given {URLPREFIX}.
# Each line of the {TBLFILE} should have three fields {NUMEX} {DATE}
# {NAME}, where {NUMEX} is the URL of the letter minus the {URLPREFIX}
# but including the extension (".htm" or ".pdf"), {DATE} is the ISO date of
# the letter, and {NAME} is the sender's name with blanks replaced by
# "_". Comment lines starting with "#" and blank lines in {TBLFILE}
# are ignored.
# Whenever an input {SECURL} is not found in the table, the
# strings "???-??-??" and "???" are substituted for the date and name.
# In any case, input lines that are blank or begin with "#" are not
# changed.
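# For illustration only (hypothetical prefix, letter key, and names, not taken
# from a real run): with URLPREFIX "https://www.sec.gov/comments/", a {TBLFILE} line
#   s70820-45.htm  2007-08-20  John_Doe
# and an input line
#   https://www.sec.gov/comments/s70820-45.htm @ Opposes the proposed rule.
# would yield one table row that links "2007-08-20" and "John_Doe" to that URL,
# followed by the comment text, and preceded by a sort-key HTML comment.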
abort = -1;
if (table == "") { arg_error("must specify \"-v table=TBLFILE\"\n"); }
if (prefix == "") { arg_error("must specify \"-v prefix=URLPREFIX\"\n"); }
# URL to date and name table:
split("", tbdate); # Indexed with the {NUMEX}.
split("", tbname); # Indexed with the {NUMEX}.
read_table(table,tbdate,tbname);
# Lines of a section to be sorted:
section_table_reset()
}
(abort >= 0) { exit abort; }
/^[#]/ {
print;
next;
}
/^ *$/ {
# Ignore blank lines inside a "<table>...</table>" section:
if (nsecrow < 0) { print; }
next;
}
/<table>/ {
# Start of a new section table:
if (nsecrow >= 0) { data_error("missing '</table>'\n"); }
section_table_clear();
next;
}
/[@]/ {
if (abort >= 0) { exit abort; }
if (NF < 3) { data_error("not enough input fields\n"); }
if ($2 != "@") { data_error("malformed table line\n"); }
if (nsecrow < 0) { data_error("missing '<table>'\n"); }
# Get the URL of the letter {url}:
url = $1;
# Get the comment {cm}, while preserving {$0}:
lin = $0;
$2 = ""; $1 = "";
printf " [[%s]]\n", $0 > "/dev/stderr"; # Debug: echo the extracted comment text.
cm = $0;
$0 = lin;
# Split the prefix {pr} from the URL leaving {sn}:
pr = substr(url, 1, length(prefix))
sn = substr(url, length(prefix) + 1)
if (pr != prefix) { data_error(("URL \"" url "\" does not start with the prefix \"" prefix "\"\n")); }
# Look up {sn} in table:
if (sn in tbdate) {
dt = tbdate[sn]; na = tbname[sn]
} else {
dt = "???-??-??"; na = "???";
}
section_table_save_row(sn, dt, na, cm);
next;
}
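# For illustration (hypothetical URL, not from a real run): with
# prefix = "https://www.sec.gov/comments/" and
# url = "https://www.sec.gov/comments/s70820-99.pdf", the rule above looks up
# sn = "s70820-99.pdf" in {tbdate}/{tbname}; if it is absent, the row is still
# emitted, with "???-??-??" as the date and "???" as the name.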
/<[\/]table>/ {
# End of section table:
if (nsecrow < 0) { data_error("missing '<table>'\n"); }
section_table_dump();
section_table_reset();
next;
}
# All other lines are copied to the output unchanged:
// { print; next; }
END {
if (nsecrow >= 0) { data_error("missing '</table>'\n"); }
}
function read_table(fname,tbdate,tbname, ntbl,nlin,lin,linx,fld,nfld,tmp) {
# Reads the {NUMEX} {DATE} {NAME} table from file {fname} into
# {tbdate[NUMEX] = DATE} and {tbname[NUMEX] = NAME}.
# Blank lines and "#"-comments in the file are ignored.
ntbl=0;
nlin=0;
while((getline lin < fname) > 0) {
printf " > %s\n", lin > "/dev/stderr"; # Debug: echo each table line as it is read.
nlin++;
if (! match(lin, /^[ \011]*([#]|$)/))
{ linx = lin; gsub(/ *[#].*$/, "", linx);
nfld = split(linx, fld, " ");
if (nfld != 3) { tbl_error(fname, nlin, ("bad table entry = \"" lin "\"")); }
# Check for repeated keys:
if (fld[1] in tbdate) { tbl_error(fname, nlin, ("repeated key = \"" lin "\"")); }
tbdate[fld[1]] = fld[2];
tbname[fld[1]] = fld[3];
ntbl++;
}
}
if ((ERRNO != "0") && (ERRNO != "")) { tbl_error(fname, nlin, ERRNO); }
close (fname);
if (nlin == 0) { arg_error(("file \"" fname "\" empty or missing")); }
printf "loaded %6d table entries\n", ntbl > "/dev/stderr"
}
function section_table_save_row(sn,dt,na,cm, tx,url,bn,key,tbrow) {
# Formats and saves the HTML table row for letter number {sn}, date {dt}, name {na}, comment {cm}.
# Protect 'href' fields in comment:
tx = cm;
tx = gensub(/href=["]([^"]+)["]/, "href=@<<\\1@>>", "g", tx)
# Remove the double quotes around quoted text (quotes inside 'href' attributes are preserved):
tx = gensub(/["]([^"]+)["]/, "\\1", "g", tx)
# Unprotect 'href' fields in comment:
tx = gensub(/href=@<<([^<>]+)@>>/, "href=\"\\1\"", "g", tx)
# Bare number and extension:
bn = sn;
gsub(/^[0-9]+-/, "", bn)
# Sort key: an HTML comment with the date and the bare letter number:
key = ("<!-- " dt " " bn " -->");
# Full letter URL:
url = (prefix sn)
tbrow = sprintf("%-32s<tr><td><a href=\"%s\">%s</a> | <a href=\"%s\">%s</a></td><td>%s</td></tr>", key, url, dt, url, na, tx);
nsecrow++
secrow[nsecrow] = tbrow
}
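# For illustration only (hypothetical arguments, not from a real run): with
# prefix "https://www.sec.gov/comments/", the call
#   section_table_save_row("s70820-45.htm", "2007-08-20", "John_Doe", "Second letter.")
# saves a row consisting of the sort key "<!-- 2007-08-20 s70820-45.htm -->"
# followed by a "<tr>" whose first cell links the date and the name to
# "https://www.sec.gov/comments/s70820-45.htm" and whose second cell holds
# "Second letter.".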
# The lines of each table are temporarily saved in {secrow[1..nsecrow]}.
# If {nsecrow} is negative, we are not parsing a table.
function section_table_reset() {
# Resets the state to 'not inside a table'.
nsecrow = -1
}
function section_table_clear() {
# Initializes the section table to empty.
nsecrow = 0
split("", secrow) # Indexed from 1 to {nsecrow}
}
function section_table_dump( i,ns) {
# Outputs the current section table as "<table>...</table>", with the rows
# sorted by their sort-key comments:
ns = asort(secrow);
if (ns != nsecrow) { prog_error("row count inconsistency"); }
printf "  <table>\n";
for (i = 1; i <= nsecrow; i++) {
printf "    %s\n", secrow[i];
}
printf "  </table>\n";
}
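# For illustration only (hypothetical input, not real SEC data): a section such as
#   <table>
#   https://www.sec.gov/comments/s70820-45.htm @ Opposes the rule.
#   https://www.sec.gov/comments/s70820-12.pdf @ Supports the rule.
#   </table>
# is re-emitted as a "<table>" whose "<tr>" rows carry the linked dates and
# names plus the comments, sorted by the "<!-- {DATE} {NUMBER} -->" key comments.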
function arg_error(msg) {
printf "%s\n", msg > "/dev/stderr";
printf "usage: %s\n", usage > "/dev/stderr";
abort = 1;
exit 1
}
function tbl_error(f,n,msg) {
printf "%s:%d: ** %s\n", f, n, msg > "/dev/stderr";
abort = 1;
exit 1
}
function data_error(msg) {
printf "%s:%d: ** %s\n", FILENAME, FNR, msg > "/dev/stderr";
abort = 1;
exit 1
}
function prog_error(msg) {
printf "%s:%d: ** prog error: %s\n", FILENAME, FNR, msg > "/dev/stderr";
abort = 1;
exit 1
}