#! /usr/bin/gawk -f
# Last edited on 2004-01-25 18:24:28 by stolfi

# Reads an HTML file and joins/breaks it into lines containing
# (a) only one tag, or (b) plain text.
# Replaces tabs ('\011') by space, removes VT ('\013'), FF ('\014')
# and CR ('\015').  Empty input lines are skipped.
#
# Global state:
#   buf    input text that has not been written out yet; successive
#          input lines are joined with "\n".
#   sep    separator placed before the next buffered line ("" until the
#          first line has been buffered, "\n" afterwards).
#   abort  exit status requested by error(), or -1 when there is none.

BEGIN {
  abort = -1;
  buf = "";
  sep = "";
}

# Normalize control characters before the line is buffered.
/./ { gsub(/[\t]/, " ", $0); }
/./ { gsub(/[\f\v\r]/, "", $0); }

(abort >= 0) { exit abort; }

# Buffer the cleaned line and emit every object that is already complete.
# The pattern is evaluated after the clean-up rules above, so a line that
# contained only removed characters is skipped as well.
/./ {
  buf = (buf sep $0);
  sep = "\n";
  while (splitbuf()) { }
}

END {
  if (abort >= 0) { exit abort; }
  # A sentinel "<" terminates trailing text or an unclosed trailing tag,
  # so that splitbuf() can flush whatever is still buffered.
  buf = (buf "<");
  while (splitbuf()) { }
  # After flushing, only the sentinel itself may remain.
  if (buf != "<") { error(("huh?")); }
}

function splitbuf(   obj)
{
  # If "buf" begins with a complete tag, a badly closed tag, a stray ">",
  # or a string terminated by "<" or ">", removes that object from "buf",
  # prints it (unless it is blank) and returns 1.  Otherwise leaves "buf"
  # untouched and returns 0, meaning that more input is needed.
  # "obj" is a local variable, not a parameter.
  if (match(buf, /^[^<>]+[<>]/))
    { # Non-tag text followed by a delimiter; the delimiter stays in "buf".
      obj = substr(buf, 1, RLENGTH-1);
    }
  else if (match(buf, /^ *<[^<>]*>/))
    { # Complete tag.
      obj = substr(buf, 1, RLENGTH);
    }
  else if (match(buf, /^ *<[^<>]*</))
    { # Badly closed tag: another "<" shows up before the closing ">".
      # The second "<" must already be in "buf"; otherwise the tag may
      # simply continue on the next input line (and the lone "<"
      # sentinel added by END must not be consumed).  The second "<"
      # stays in "buf" as the start of the next object.
      obj = substr(buf, 1, RLENGTH-1);
    }
  else if (match(buf, /^>/))
    { # Spurious ">" outside any tag (left behind when text was cut at
      # a ">").  Emit it as a piece of plain text so that the input
      # after it is not held back forever.
      obj = substr(buf, 1, RLENGTH);
    }
  else
    { return 0; }

  buf = substr(buf, length(obj)+1);

  # Trim the object, and reduce every run of blanks/newlines that
  # contains a newline to a single newline.
  gsub(/^[ \n]*/, "", obj);
  gsub(/[ \n]*[\n][ \n]*/, "\n", obj);
  gsub(/[ \n]*$/, "", obj);

  if (substr(obj,1,1) == "<")
    { # A tag is always written on a single line.
      gsub(/[\n]+/, " ", obj);
    }
  # Plain text keeps its (already collapsed) line breaks.

  if (obj != "") { print obj; }
  return 1;
}

function error(msg)
{
  # Reports "msg" on the standard error stream, prefixed with the value
  # of FNR, then terminates the program with exit status 1.
  printf "line %d: %s\n", FNR, msg > "/dev/stderr";
  abort = 1;
  exit abort;
}