#! /usr/bin/gawk -f # Last edited on 2000-05-09 01:06:16 by stolfi # Assumes the file has been converted to lower case. /./ { gsub(/[•]/, " ", $0); gsub(/]*>/, " ", $0); gsub(/<(meta|META)[^<>]*>/, " ", $0); gsub(/<[\/]*(b|B)>/, " ", $0); gsub(/<[\/]*(body|BODY)[^<>]*>/, " ", $0); gsub(/<[\/]*(head|HEAD)[^<>]*>/, " ", $0); gsub(/<[\/]*(html|HTML)[^<>]*>/, " ", $0); gsub(/<[\/]*(table|TABLE)[^<>]*>/, " ", $0); gsub(/<(td|TD)>[0-9]*<\/(td|TD)>/, " ", $0); gsub(/<[\/]*(hr|HR)[^<>]*>/, " ", $0); gsub(/<[\/]*(tr|TR)[^<>]*>/, " ", $0); gsub(/<[\/]*(td|TD)[^<>]*>/, " ", $0); gsub(/<[\/]*(body|BODY)[^<>]*>/, " ", $0); gsub(/^[ ]*[0-9][0-9]*[ ]*$/, " ", $0); gsub(/^[ ]*f[0-9][0-9]*[rv][0-9rv,]*[ ]*$/, " ", $0); gsub(/[(][^()<>]*[)]/, " ", $0); gsub(/[(][^()<>]*[)]/, " ", $0); gsub(/^[ ]*[()]*[ ]*$/, " ", $0); gsub(/Total words/, " ", $0); gsub(/Total word/, " ", $0); gsub(/Patterns of word/, " ", $0); gsub(/Pattens of word/, " ", $0); # Page header lines: $0 = gensub(/