#! /usr/bin/gawk -f
# Last edited on 2004-01-25 18:24:28 by stolfi

# Reads an HTML file and joins/breaks it into lines containing 
# (a) only one tag, or (b) plain text.
# Replaces tabs ('\011') by spaces; removes VT ('\013'), FF ('\014') and CR ('\015').

BEGIN {
  # Global state shared by the rules and functions below.
  buf = "";     # Input accumulated so far that has not been output yet.
  sep = "";     # Inserted between "buf" and the next input line appended to it.
  abort = -1;   # Exit status requested by error(); negative means "no error".
}

{
  # Clean up control characters on non-empty lines: a tab becomes a
  # blank; form-feed, vertical-tab and carriage-return are deleted.
  if ($0 ~ /./) {
    gsub(/[\t]/, " ", $0);
    gsub(/[\f\v\r]/, "", $0);
  }

  # Quit at once if an error has already been flagged.
  if (abort >= 0) { exit abort; }

  # If the cleaned line still has something in it, append it to the
  # pending buffer and output every item that is now complete.
  if ($0 ~ /./) {
    buf = (buf sep $0);
    sep = "\n";
    for (;;) {
      if (! splitbuf()) { break; }
    }
  }
}

END {
  # An error was already reported: just propagate its exit status.
  if (abort >= 0) { exit abort; }

  # Append a sentinel "<" so that any pending text at the end of the
  # input is seen as "text followed by a tag" and gets output.
  buf = (buf "<");
  for (;;) {
    if (! splitbuf()) { break; }
  }

  # After draining, exactly the sentinel should be left in "buf".
  if (buf != "<") { error("huh?"); }
}

function splitbuf(  obj)
{
  # Tries to remove one item from the front of the global "buf" and
  # print it on its own line.  An item is one of:
  #   - a run of plain text that is followed by "<" or ">";
  #   - a complete tag "<...>";
  #   - an improperly closed tag, i.e. "<..." followed by another "<";
  #   - a spurious ">".
  # Returns 1 if an item was removed from "buf" (nothing is printed
  # when the item is blank), or 0 if "buf" does not yet begin with a
  # complete item.  "obj" is a local variable, not a parameter.

  if (match(buf, /^[^<>]+[<>]/))
    { # Plain text followed by a tag delimiter; the delimiter stays in "buf".
      obj = substr(buf, 1, RLENGTH-1);
    }
  else if (match(buf, /^ *<[^<>]*>/))
    { # Complete tag
      obj = substr(buf, 1, RLENGTH);
    }
  else if (match(buf, /^ *<[^<>]*</))
    { # Improperly closed tag.  The final "<" begins the next tag (or is
      # the end-of-input sentinel), so it must be left in "buf".
      obj = substr(buf, 1, RLENGTH-1);
    }
  else if (match(buf, /^ *>/))
    { # Spurious ">"
      obj = substr(buf, 1, RLENGTH);
    }
  else
    { return 0; }

  # Consume the item; every branch above yields a non-empty "obj",
  # so "buf" always gets shorter and callers' loops terminate.
  buf = substr(buf, length(obj)+1);

  # Trim leading and trailing blanks/newlines, and squeeze any run of
  # blanks/newlines that contains a newline into a single newline.
  gsub(/^[ \n]*/, "", obj);
  gsub(/[ \n]*[\n][ \n]*/, "\n", obj);
  gsub(/[ \n]*$/, "", obj);

  # A tag is printed on a single line; plain text keeps its (already
  # squeezed) line breaks.
  if (substr(obj,1,1) == "<")
    { gsub(/[\n]+/, " ", obj); }

  if (obj != "") { print obj; }
  return 1;
}

function error(msg)
{
  # Reports a fatal problem and terminates the program.
  #   msg: text of the diagnostic.
  # The message is prefixed with the current value of FNR and is
  # written to standard error, so that it does not get mixed into the
  # filtered output on standard output.  The global "abort" is set
  # before exiting so that the END rule, which checks it first, exits
  # with the same status instead of flushing "buf".
  printf "line %d: %s\n", FNR, msg > "/dev/stderr";
  abort = 1;
  exit abort;
}