#! /bin/bash

# Converts HTML index pages back to the ".txt" format

# Usage: <this script> NAME...
# Each NAME is a page name without directory or extension.

#######################################
# Converts one HTML index page to the ".txt" format.
# Keeps only the lines containing "http:" or "https:", moves the href
# target to the front of the line followed by " @@ ", and strips
# <li> / </li> tags.
# Arguments: $1 - page name (no directory, no extension)
# Inputs:    html-2020-05-27/<name>.html (relative to the current directory)
# Outputs:   raw-reversed/<name>.txt; progress and errors go to stderr
# Returns:   1 if the output file already exists and is non-empty, or if
#            the input file is missing/unreadable; otherwise the status of
#            the final pipeline stage (sed, or the output redirection).
#######################################
convert_page() {
  local name="$1"
  local hfile="html-2020-05-27/${name}.html"
  local tfile="raw-reversed/${name}.txt"

  echo "${hfile} --> ${tfile}" 1>&2

  # Never clobber a previously generated (non-empty) result.
  if [[ -s "${tfile}" ]]; then
    echo "** file ${tfile} already exists" 1>&2
    return 1
  fi

  # Check the input up front: otherwise the redirection below would
  # still create an empty output file for a page that does not exist.
  if [[ ! -r "${hfile}" ]]; then
    echo "** file ${hfile} not found or unreadable" 1>&2
    return 1
  fi

  # grep exits non-zero when no line matches; that is not treated as an
  # error here (no pipefail), so a page without links yields an empty file.
  grep -E -e 'http[s]?[:]' -- "${hfile}" \
    | sed -E \
        -e 's:^(.*)href="([^"]+)":\2 @@ \1:g' \
        -e 's:<[/]?li>::g' \
    > "${tfile}"
}

# Process every page named on the command line; stop at the first failure.
for name in "$@"; do
  convert_page "${name}" || exit 1
done
