#! /usr/bin/perl
# Checks consistency of links within a site
# Created June 1997 by J. Stolfi
# Based on "webxref", written 1995 by Rick Jansen (rick@sara.nl)
# wwwcheck verifies the contents of a site, checking whether
#
# (a) all files are accessible from a given set of root URLs, and
# (b) all links in all pages are valid.
#
# Glossary:
#
# An URL consists of an optional "protocol" ("http:", "ftp:", etc)
# followed by a "locator". The URL is "ordinary" if it begins with
# "http:" or with no protocol. It is "special" if it begins with any
# other protocol (such as "mailto:", "telnet:", "news:", "file:",
# "ftp:", or "gopher:").
#
# The locator of an ordinary URL consists of a possibly empty "path",
# and an optional "qualifier" (from the last "#" onwards, or the
# arguments of a cgi-bin call).
#
# A "dirpath" is a path that ends with "/".
#
# A path is "global" if it begins with "//", and "local" otherwise.
#
# A "hostpath" is either empty (meaning the local file system)
# or "//" followed by a host name and optional port number
# (meaning an HTTP virtual file system).
#
# A local path is "absolute" if it begins with "/", and "relative"
# otherwise. Global paths are absolute by definition.
#
# "Completing" a local relative path means prefixing to it some
# given hostpath and some absolute dirpath, usually derived from the
# context. Completing an absolute local path means prefixing it with
# a given hostpath. Completing a global path is a no-op. (Note that
# the hostpath may be empty, so the resulting path may be either local
# or global.)
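# (For example, completing the relative path "img/logo.gif" against
# hostpath "//www.unicamp.br" and dirpath "/~mmh/" yields
# "//www.unicamp.br/~mmh/img/logo.gif".)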
#
# "Localizing" a global path means checking whether some prefix of
# it matches some member of a user-specified list of directory
# paths; and, if it does, replacing that prefix by some local
# directory path. Localizing a local path is a no-op.
#
# "Actualizing" a local path means looking it up in the local
# filesystem, following through symbolic links, and replacing it by
# the actual name of the final file (as defined by "cd" and "pwd").
# This process may be carried out incompletely if the path does not
# name a valid directory. Actualizing a global path is a no-op.
#
# "Normalizing" an ordinary URL means discarding the protocol and
# qualifier parts, and then completing, localizing, and actualizing
# the remaining path. Normalizing a special URL is a no-op.
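# (For example, assuming "-map http://www.unicamp.br/ /home/www" was
# given, the URL "http://www.unicamp.br/~mmh/index.html#sec2" is
# normalized by discarding "http:" and "#sec2", localizing the result
# to "/home/www/~mmh/index.html", and actualizing that path in the
# local filesystem.)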
#
# A "site" is a set of directory paths. (Currently they must be
# local). The "contents" of a site are the files and
# sub-directories in the directories named by those paths. Symbolic
# links are followed whenever they point to valid objects. A path
# is "internal" if its directory part is one of the site's
# directories, and "external" otherwise.
#
# A "page" is a text in HTML format.
#
# A "link" from a page is an URL that is mentioned in that page, either
# as an or
. A "root link"
# is an URL provided through the "-root" option.
#
# "Fetching" a local path means reading the contents of the file named by it.
# If the named file is actually a directory, then the standard entry page names
# ("index.html", "Welcome.html", etc.) are tried, in a fixed order.
# Fetching a global path means accessing it via the HTTP protocol.
# In either case, the outcome is either a page, or an access error.
#
# An ordinary URL is said to "work" if, after normalization,
# fetching it doesn't yield an error. A special URL is assumed
# to work iff it is not "file:" and is syntactically OK.
#
# An URL is "valid" if it works, or the user has explicitly
# declared it to be valid. An URL is "invalid" if it doesn't work,
# or the user has explicitly declared it to be invalid. (In case
# the user declares an URL to be both valid and invalid, the latter
# takes precedence).
#
# "Testing" a local or global URL means checking whether it works,
# i.e. whether accessing it will not yield an error code.
# "Classifying" an URL means checking whether it is valid or invalid.
# "Scanning" a page means collecting all the URLs in the links in
# that page, normalizing them, and classifying them.
use strict 'subs';
$usage =
"wwwcheck \\
[-localOnly] [-silent] \\
[-valid ERE].. [-invalid ERE].. [-dontScan ERE].. \\
[-root URL].. [-map URL DIR] \\
DIR...";
#---------------------------------------------
# Set $debug = 1 to print internal diagnostic messages
$debug = 0;
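#---------------------------------------------
# PrintUsage() / PrintHelp()
#
# ParseArgs (below) relies on these. Minimal versions based on $usage;
# a fuller help text, like webxref's, remains to be adapted.
sub PrintUsage ()
{
  print "usage: $usage\n";
  exit 1;
}
sub PrintHelp ()
{
  print "usage: $usage\n";
  exit 0;
}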
#---------------------------------------------
# EnsureDirLike($dir)
#
# Makes sure $dir ends with "/"
sub EnsureDirLike($)
{
my($dir) = $_[0];
if ($dir =~ m#/$#)
{ return $dir; }
else
{ return "$dir/"; }
}
#---------------------------------------------
# EnsureGlobalLike($url)
#
# Makes sure $url begins with "//"
sub EnsureGlobalLike($)
{
my($url) = $_[0];
$url =~ s#^http:##i;
if ($url =~ m#^//#)
{ return "$url"; }
else
{ die "bad global URL = $url, stopped"; }
}
#---------------------------------------------
# EnsureLocalLike($url)
#
# Makes sure $url does not begin with any protocol or "//"
sub EnsureLocalLike($)
{
my($url) = $_[0];
if (($url =~ m#^[a-zA-Z]*:#) || ($url =~ m#^//#))
{ die "bad local filename = $url, stopped"; }
else
{ return $url; }
}
#---------------------------------------------
# GetCWD($default)
#
# Gets the actual name of the current working directory.
# In case of failure, returns $default.
# Makes sure it ends with a slash.
sub GetCWD($)
{
my($default) = $_[0];
my($dir) = `pwd`;
if ($dir eq "")
{ $dir = $default; }
else
{ chomp $dir; }
return EnsureDirLike($dir);
}
#---------------------------------------------
# SplitURL($url)
#
# Splits an URL into protocol, path, and qualifier
# e.g. http://www.unicamp.br/~mmh/foobar/.WWW/Welcome.html#foobar
# returns: ('http:', '//www.unicamp.br/~mmh/foobar/.WWW/Welcome.html', '#foobar')
# Any part may be missing.
sub SplitURL($)
{
my($url)=$_[0];
my($qual,$prot);
if ($url =~ m/^([a-z]*:)"/)
{ $prot = $1; $url = $2; }
else
{ $prot = ""; }
if ($url =~ m/^(.*)(#[^#\/]*)$/)
{ $url = $1; $qual = $2; }
elsif ($url =~ m#^(.*/cgi-bin/[^?]*)(\?.*)$#)
{ $url = $1; $qual = $2; }
else
{ $qual = ""; }
return ($prot, $url, $qual);
}
#---------------------------------------------
# SplitPath($path)
#
# Splits a path into hostpath, dir, and file.
# e.g. //www.unicamp.br/~mmh/foobar/.WWW/Welcome.html
# returns: ('//www.unicamp.br', '/~mmh/foobar/.WWW/', 'Welcome.html')
# Any part may be empty. If not empty, the directory ends with "/".
# If not empty, the hostpath begins with "//" and contains no other "/".
# The file part contains no "/".
sub SplitPath($)
{
my($file)=$_[0];
my($host,$dir);
if ($file =~ m#^(//[^/]*)(.*)$# )
{ $host = $1; $file = $2; }
else
{ $host = ""; }
if ($file =~ m#^(.*/)([^/]*)$# )
{ $dir = $1; $file = $2; }
else
{ $dir = ""; }
return ($host, $dir, $file);
}
#---------------------------------------------
# CompletePath($host,$dir,$path)
#
# Given a hostpath $host (possibly empty), a directory path $dir, and
# an arbitrary path $path, prepends $host and $dir to $path if $path
# is local relative, prepends just $host if $path is local absolute,
# and returns $path unchanged if $path is global. Assumes $dir begins
# and ends with "/", and $host is either empty or begins with "//" and
# contains no other "/"
sub CompletePath ($$$)
{
my($host,$dir,$path) = @_;
if (!($path =~ m#^/#)) { $path = $dir .$path; }
if (!($path =~ m#^//#)) { $path = $host .$path; }
return $path;
}
#---------------------------------------------
# LocalizePath($path)
#
# If $path is local, returns it unchanged.
# If $path is global, checks whether some prefix of $path
# matches some key in %LocalDirs, and if so
# replaces that prefix by the corresponding value.
#
# Assumes the keys in %LocalDirs begin with "//" and end with "/".
# Usually, the corresponding values begin with "/" but not "//"
# and end with "/".
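# e.g. if %LocalDirs maps "//www.unicamp.br/" to "/home/www/", then
# LocalizePath("//www.unicamp.br/~mmh/x.html") returns
# "/home/www/~mmh/x.html".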
%LocalDirs = (); # Table mapping global PATHs to local directories.
sub LocalizePath ($)
{
my($path) = $_[0];
if ($path =~ m#^//#)
{
foreach my $key (keys(%LocalDirs))
{ my($len) = length($key);
if ($key eq substr($path, 0, $len))
{
my($res) = $LocalDirs{$key} . substr($path, $len);
return $res;
}
}
}
return $path;
}
#---------------------------------------------
# ActualizePath($path)
#
# If $path is a local absolute path, tries to get the actual absolute pathname
# of the file named by it. If the file doesn't exist, returns some
# partially actualized path. If $path is global, returns $path
# unchanged. Fails if $path is relative.
sub ActualizePath ($)
{
my($path) = $_[0];
my($dir);
if ($path =~ m#^//#)
{ return $path }
elsif (!($path =~ m#^/#))
{ die "non-absolute path = $path, stopped"; }
else
{ my($count) = 0;
do
{ if (-d($path))
{ $dir = EnsureDirLike($path); $path = ""; }
elsif ($path =~ m#^(.*/)([^/]*)# )
{ $dir = $1; $path = $2; }
else
{ # $path is relative to $dir; $dir remains unchanged
# Never happens on the first iteration.
}
if (chdir($dir))
{ $dir = GetCWD($dir);
if (($path ne "") && -l($path))
{ $path = readlink($path); }
else
{ return $dir . $path; }
}
else
{ return $dir . $path; }
++$count;
}
until ($count >= 50);
die "too many link hops = $dir$path, stopped";
}
}
#---------------------------------------------
# NormalizeURL($host,$dir,$url)
#
# Assumes $url is a link extracted from some HTML page
# that was found in the absolute local dirpath $dir
# in the hostpath $host (which may be empty).
# If $url is a special URL, returns it unchanged.
# Else, discards the "http:" protocol and qualifier, if any,
# and completes, localizes, and actualizes the remaining path.
#
# Assumes $dir begins and ends with "/", and $host
# is either empty or begins with "//" and has no other "/".
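# e.g. NormalizeURL("", "/home/www/~mmh/", "notes/x.html#sec2")
# completes to "/home/www/~mmh/notes/x.html", then localizes and
# actualizes that path.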
sub NormalizeURL ($$$)
{
my($host,$dir,$url) = @_ ;
my($prot,$path,$qual) = SplitURL($url);
if (($prot eq "") || ($prot =~ m#^http:$#i))
{
return ActualizePath(LocalizePath(CompletePath($host,$dir,$path)));
}
return $url;
}
#---------------------------------------------
# Execute directory:
#
$ExecDir = `pwd`;
chomp($ExecDir);
#---------------------------------------------
# Command line arguments:
$LocalOnly = 0; # 1 classifies only links to local URLs
$DontScan = ""; # ERE of pages not to scan
$Valid = ""; # ERE of valid pages
$Invalid = ""; # ERE of invalid pages
$Silent = 0; # If silent=1 only error msgs will be printed
@DirList = (); # Directories in site: unnormalized, local or global.
@RootList = (); # Root URLS, unnormalized.
sub ParseArgs()
{
while (scalar(@ARGV) && ($ARGV[0] =~ /^-/))
{ if ($debug) { print "parsing $ARGV[0]\n"; }
if ($ARGV[0] eq "-help")
{
PrintHelp();
}
elsif ($ARGV[0] eq "-localOnly")
{
$LocalOnly = 1;
}
elsif ($ARGV[0] eq "-silent")
{
$Silent = 1;
}
elsif ($ARGV[0] eq "-map")
{
shift(@ARGV);
my($url) = EnsureDirLike(EnsureGlobalLike($ARGV[0]));
shift(@ARGV);
my($dir) = ActualizePath(EnsureLocalLike($ARGV[0]));
$LocalDirs{$url} = $dir;
print "will map $url --> $dir\n";
}
elsif ($ARGV[0] eq "-dontScan")
{
shift(@ARGV);
if ($DontScan ne "") { $DontScan = $DontScan . "|"; }
$DontScan = $DontScan . $ARGV[0];
}
elsif ($ARGV[0] eq "-valid")
{
shift(@ARGV);
if ($Valid ne "") { $Valid = $Valid . "|"; }
$Valid = $Valid . $ARGV[0];
}
elsif ($ARGV[0] eq "-invalid")
{
shift(@ARGV);
if ($Invalid ne "") { $Invalid = $Invalid . "|"; }
$Invalid = $Invalid . $ARGV[0];
}
elsif ($ARGV[0] eq "-root")
{
shift(@ARGV);
my($page) = EnsureGlobalLike($ARGV[0]);
push(@RootList, $page);
}
else
{
PrintUsage();
}
shift(@ARGV);
}
@DirList = @ARGV;
print "Parsed args\n";
return;
}
#---------------------------------------------
# Site directories:
#
# The table below maps each (normalized) directory that belongs to the
# site to a list of the referenced URLs that lie in that directory.
%SiteDirs = ();
#---------------------------------------------
# Referenced URL tables:
#
# Each URL that has been seen by wwwcheck appears, in normalized form,
# in only one of the hash tables below. In every case, the key is the
# URL in question, and the value is the list of its referents --- the
# URLs of all pages where that URL appears, and/or the word "[ROOT]"
# if it was given through the "-root" option. The referent list is actually
# a string, with " " as the element separator.
#
%GlobalUnclassifiedURLs = (); # Global URLs seen but not classified yet.
%LocalUnclassifiedURLs = (); # Local URLs seen but not classified yet.
%ValidURLs = (); # URLs found to be valid.
%InvalidURLs = (); # URLs found to be invalid.
%UnreferencedURLs = (); # URLs contained in the site but not referenced yet.
#---------------------------------------------
# Main procedure
sub MainProc()
{
ParseArgs();
NormalizeRoots();
NormalizeSiteDirs();
RecClassifyURLs();
PrintLists();
print "All done.\n"
}
#---------------------------------------------
# PushRef($host,$dir,$url,$referent)
#
# Adds a recently seen URL to the URL tables.
# If the URL has already been classified, appends
# its referent to %ValidURLs or %InvalidURLs; else appends it to the
# %GlobalUnclassifiedURLs or %LocalUnclassifiedURLs
sub PushRef ($$$$)
{
my($host,$dir,$url,$referent) = @_;
$url = NormalizeURL($host,$dir,$url);
if (defined($ValidURLs{$url}))
{ $ValidURLs{$url} .= " " . $referent; }
elsif (defined($InvalidURLs{$url}))
{ $InvalidURLs{$url} .= " " . $referent; }
elsif (($url =~ m#^[a-z]*:#i) || ($url =~ m#^//#))
{ if (defined($GlobalUnclassifiedURLs{$url}))
{ $GlobalUnclassifiedURLs{$url} .= " " . $referent; }
else
{ $GlobalUnclassifiedURLs{$url} = $referent; }
}
else
{ if (defined($LocalUnclassifiedURLs{$url}))
{ $LocalUnclassifiedURLs{$url} .= " " . $referent; }
else
{ $LocalUnclassifiedURLs{$url} = $referent; }
}
}
#---------------------------------------------
# PopUnclassifiedURL ()
#
# Removes a normalized but still unclassified URL from
# %LocalUnclassifiedURLs or %GlobalUnclassifiedURLs, and returns a
# two-element array containing that URL and its referents.
# Returns () if there are none.
sub PopUnclassifiedURL ()
{
my($url);
if (scalar(%LocalUnclassifiedURLs))
{ ($url) = keys(%LocalUnclassifiedURLs);
return ($url, delete($LocalUnclassifiedURLs{$url}));
}
elsif (scalar(%GlobalUnclassifiedURLs))
{ ($url) = keys(%GlobalUnclassifiedURLs);
return ($url, delete($GlobalUnclassifiedURLs{$url}));
}
else
{ return (); }
}
#---------------------------------------------
# NormalizeRoots()
#
# Scans @RootList, normalizes each root URL, inserts it into
# the %LocalUnclassifiedURLs or %GlobalUnclassifiedURLs table.
sub NormalizeRoots ()
{
foreach my $url(@RootList)
{
my($page) = NormalizeURL("", "", $url);
PushRef("", "", $page, "[ROOT]");
}
}
#---------------------------------------------
# NormalizeSiteDirs()
#
# Scans @DirList, normalizes each directory path, and inserts it into
# the %SiteDirs table. Currently requires them to be local
# (after localization).
sub NormalizeSiteDirs ()
{
foreach my $url (@DirList)
{
my($dir);
$dir = EnsureLocalLike(NormalizeURL("", EnsureDirLike($ExecDir), EnsureDirLike($url)));
if (!defined($SiteDirs{$dir}))
{ $SiteDirs{$dir} = ""; }
else
{ print "! Duplicate site dir: $url = $dir\n"; }
}
}
#---------------------------------------------
# RecClassifyURLs()
#
# Examines each unclassified URL, classifies it, and adds it
# (with its referent list) to %ValidURLs or %InvalidURLs, as appropriate.
# Whenever it finds a new valid
# internal URL that does not match the $DontScan pattern and yields an
# HTML page, scans that page, and adds its links to the proper
# tables (%LocalUnclassifiedURLs, %ValidURLs, and %InvalidURLs). Assumes
# the URLs in those tables have been normalized.
sub RecClassifyURLs()
{
my(@entry) = PopUnclassifiedURL();
while (scalar(@entry))
{
my($url, $refs) = @entry;
if (!($Silent)) { print "Classifying: ${url}\n"; }
my($prot,$path,$qual) = SplitURL($url);
my($valid, $internal);
if ($prot eq "file:")
{ # "file:" URLs are invalid because they don't work elsewhere
$valid = 0; $internal = 0;
}
elsif ($prot =~ m#^(mailto:|telnet:|ftp:|gopher:|news:)$#)
{ # These special URLs are valid by default.
$valid = 1; $internal = 0;
}
elsif (($prot eq "") || ($prot =~ m#^http:$#i))
{ # Ordinary URL: internal iff its directory part is a site directory.
my($phost,$pdir,$pfile) = SplitPath($path);
$internal = (!($path =~ m#^//#)) && defined($SiteDirs{$pdir});
if (($Invalid ne "") && ($url =~ m/$Invalid/)) { $valid = 0; }
elsif (($Valid ne "") && ($url =~ m/$Valid/)) { $valid = 1; }
else { $valid = &TestURL($url); }
}
else
{ print "! Invalid protocol in link = $prot\n";
$valid = 0; $internal = 0;
}
# Move to the proper table, and scan if appropriate:
if ($valid)
{ if (defined($ValidURLs{$url}))
{ die "! duplicate valid URL = $url, stopped"; }
$ValidURLs{$url} = $refs;
if ($internal && (($DontScan eq "") || !($url =~ m/$DontScan/)))
{ ScanURL($path); }
}
else
{ if (defined($InvalidURLs{$url}))
{ die "! duplicate invalid URL = $url, stopped"; }
$InvalidURLs{$url} = $refs;
}
@entry = PopUnclassifiedURL();
}
}
#---------------------------------------------
# ScanURL($path)
#
# Fetches the file named by $path; if it is HTML, collects all links
# normalizes them, and places them in %LocalUnclassifiedURLs or
# updates %ValidURLs and %InvalidURLs.
sub ScanURL ($)
{
my($path) = $_[0];
if ($path =~ m#^//#)
{ die "can't scan remote files yet, stopped"; }
else
{ ScanFile($path); }
}
#---------------------------------------------
# Enumerates @DirList and puts their contents in
# %UnreferencedURLs (files and dangling links) and
# %UnreferencedSubDirs (sub-directories)
sub GetSiteContents ()
{
print "! GetSiteContents not written yet\n";
return;
%UnreferencedURLs = ();
%UnreferencedSubDirs = ();
foreach my $dir (keys(%SiteDirs))
{ # Does the directory exist at all?
stat($dir);
die "Cannot find directory $dir\n" unless -d $dir;
GetDirContents($dir);
}
}
#---------------------------------------------
# Enumerates a given directory and adds its contents to
# %UnreferencedURLs (files and dangling links) and
# %UnreferencedSubDirs (sub-directories)
# The argument is an URL (local or global) that designates a directory.
sub GetDirContents ($)
{
my($dir) = @_;
print "! GetDirContents not written yet\n";
return;
}
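#---------------------------------------------
# PrintLists()
#
# Prints the classified URL tables. This is a minimal version, pending
# a full report along the lines of webxref's Print_Lists: invalid URLs
# are always shown, valid ones only when not $Silent.
sub PrintLists ()
{
PrintURLTable(\%InvalidURLs, "Invalid URLs:");
if (!$Silent) { PrintURLTable(\%ValidURLs, "Valid URLs:"); }
}
#---------------------------------------------
# PrintURLTable($table,$header)
#
# Prints the URLs in the table referenced by $table, each followed by
# its referent list.
sub PrintURLTable ($$)
{
my($table,$header) = @_;
if (!scalar(%$table)) { return; }
print "\n$header\n";
foreach my $url (sort(keys(%$table)))
{ print "$url\n";
print "  Referenced by:\n";
foreach my $ref (sort(split(/ /, $$table{$url})))
{ print "    $ref\n"; }
}
}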
#---------------------------------------------
# HTTP status codes and messages
%OkStatusMsgs = (
200, "OK 200",
201, "CREATED 201",
202, "Accepted 202",
203, "Partial Information 203",
204, "No Response 204",
);
%FailStatusMsgs = (
-1, "Could not lookup server",
-2, "Could not open socket",
-3, "Could not bind socket",
-4, "Could not connect",
301, "Found, but moved",
302, "Found, but data resides under different URL (add a /)",
303, "Method",
304, "Not Modified",
400, "Bad request",
401, "Unauthorized",
402, "PaymentRequired",
403, "Forbidden",
404, "Not found",
500, "Internal Error",
501, "Not implemented",
502, "Service temporarily overloaded",
503, "Gateway timeout ",
600, "Bad request",
601, "Not implemented",
602, "Connection failed (host not found?)",
603, "Timed out",
);
#---------------------------------------------
# Files to try in case of a directory reference like ../..
@default_files = (
'Welcome.html',
'welcome.html',
'index.html',
'index.shtml',
'README.html'
);
#---------------------------------------------
# Tests whether an URL is valid.
# If it is a special URL (e.g. "mailto:"), checks the syntax only.
# If it is a local URL, checks whether the file exists and is readable
# (if the URL is a directory, completes it with the standard paths).
# If it is a global URL, tries to fetch it.
# Returns 1 if valid, 0 if invalid.
sub TestURL ($)
{
# STUB: for now, every URL is assumed to work.
return 1;
}
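#---------------------------------------------
# TestLocalPath($path)
#
# A minimal sketch of the local half of the test that TestURL should
# eventually perform (hypothetical helper, not called anywhere yet):
# checks that the file named by the local path $path exists and is
# readable, trying the standard entry pages when $path is a directory.
sub TestLocalPath ($)
{
my($path) = $_[0];
if (-d $path)
{ # Directory: try the standard entry pages, in order.
foreach my $def (@default_files)
{ my($f) = EnsureDirLike($path) . $def;
if ((-f $f) && (-r $f)) { return 1; }
}
return 0;
}
return ((-f $path) && (-r $path)) ? 1 : 0;
}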
#---------------------------------------------
# CleanupLink($link)
#
# Takes a raw link tag, e.g. `<a href="URL">' or
# `<img align=middle src="URL">', and returns a list of the
# bare URLs in it.
#
sub CleanupLink($)
{
my($link) = $_[0] . " ";
my(@urls) = ();
my($url);
$link =~ s/^[ ]*<[ ]*//;
while($link ne "")
{
$url = "";
if ($link =~ m/^[a-zA-Z]+[ ]*=/)
{ # NAME = VALUE item
$link =~ s/^[a-zA-Z]+[ ]*=[ ]*//;
if ($link =~ m/^"/)
{ # quoted URL: remove quotes
if ($link =~ m/^".*"/)
{ ($url, $link) = ($link =~ m/^"([^"]*)"(.*)$/);
$url =~ s/^[ ]+//;
$url =~ s/[ ].*$//;
}
else
{ print "! unmatched \" in link = `${link}'\n";
if (!($link =~ s/^".*[ ]+//)) { $link = ""; }
$url = "";
}
}
else
{ # unquoted URL: delimited by blank or ">"
($url,$link) = ($link =~ m/^([^ >]*)([ >].*|)$/);
}
if ($url ne "") { push(@urls, $url); }
}
else
{ # tag name or other non-URL word: discard it
$link =~ s/^[^ >]*//;
}
$link =~ s/^[ >]+//; # skip delimiters
}
return @urls;
}
#---------------------------------------------
# ScanFile($filename)
#
# Scans a local HTML file, collecting all links, normalizing them,
# and adding them to the URL tables (via PushRef).
# Uses the temporary file:
$temp_file = "/tmp/webxref.$$";
sub ScanFile ($)
{
my($filename) = $_[0];
my($text) = "";
my($size);
if (!(open(HTML, $filename)))
{ print "! Could not open file $filename\n"; return; }
# Slurp the whole file into $text:
my($offset)=0;
do {
$size = read(HTML,$text,32768,$offset);
$offset += $size;
} until $size != 32768;
close(HTML);
# Make sure all <tags> end up on lines by themselves:
open(TEMP, ">$temp_file") || die "Could not create $temp_file\n";
$text =~ s/\n/ /g;
$text =~ s/^[^<]*//;
$text =~ s/(<[^>]*>)[^<]*/$1\n/g;
print TEMP "$text";
$text="";
close(TEMP);
open(HTML, $temp_file) || die "Could not open $temp_file\n";
my(%newlist);
while (<HTML>) {
chop;
s/\s+/ /g; # replace funny spaces by ordinary spaces.
# Keep only tags that carry an URL-valued attribute:
next unless s/^<[^>=]*(href|src|action|background)[ ]*=[ ]*//i;
if (m/^"/)
{ # quoted URL: remove quotes
if (!(s/^"([^"]*)".*$/$1/))
{ print "! unmatched \" in link = `$_'\n"; next; }
s/^[ ]+//; s/[ ].*$//;
}
else
{ # unquoted URL: delimited by blank or ">"
s/[ >].*$//;
}
next if ($_ eq "");
# Link to section within current document?
if (m/^#.*/)
{ # ignore for now
}
else
{ # Link to another document
if ($debug) { print "added to newlist: $_\n"; }
$newlist{$_} = 1;
}
}
close(HTML);
if ($debug) {
# List the links found
print "\nNewlist:\n";
foreach my $url (keys(%newlist)) {
print "$url \n";
}
}
# Walk the list, pushing each link with this page as its referent:
my($host,$dir,$file) = SplitPath($filename);
foreach my $url (keys(%newlist)) {
PushRef($host, $dir, $url, $filename);
}
unlink($temp_file);
}
#---------------------------------------------
MainProc();
__END__
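# NOTE: Everything below this "__END__" marker is leftover webxref
# code, kept for reference during the rewrite; Perl does not compile
# or execute it.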
sub Get_Refs {
# Recursively get all files referenced from the given file
local(%newlist);
local($file);
local($dir);
local($Old_Dir);
local($filename);
$dir=&Dir_Name($_[0]);
if ($dir eq "") {
$dir = &Get_PWD;
}
$file=&Base_Name($_[0]);
#print "--------------------\n";
if ($debug) {
print "arg=$_[0]\n";
print "dir=$dir\n";
print "file=$file\n";
}
# directory reference?
if ($file eq "") {
if ($debug) {
print "$dir must be a dir, refd by $_[1]!\n";
}
if (-d $_[0]) {
if (!defined($DirList{$_[0]})) {
$DirList{$_[0]} = $_[1];
}
else {
$DirList{$_[0]} = "$DirList{$_[0]} $_[1]";
}
}
else {
if (!defined($DirNotFoundList{$_[0]})) {
$DirNotFoundList{$_[0]} = $_[1];
}
else {
$DirNotFoundList{$_[0]} = "$DirNotFoundList{$_[0]} $_[1]";
}
}
return;
}
# Move to the specified directory
$Old_Dir = &Get_PWD;
if ($debug) {
print "Chdir to $dir\n";
}
chdir($dir);
$dir=&Get_PWD;
if ($debug) {
print "Now in $dir\n";
}
$filename = $dir . $file;
if (! $Silent) {
print "Checking: $filename\n";
}
# Is it a reference to a specific section? (a file#section reference)
if ($filename =~ m/(.+)#(.+)/) {
$filename = "$1#$2";
if (&CheckQual($1, $2) ) {
#print "** Qual $2 is present in file $1\n";
# Add to the list of qualifs
if (!defined($QualifList{$filename})) {
$QualifList{$filename} = $_[1];
}
else {
$QualifList{$filename} = "$QualifList{$filename} $_[1]";
}
}
else {
print "xx Qual $2 is NOT present in file $1\n";
print "xx Referenced by: $_[1]\n";
#print "Qual filename: $filename\n";
# Add to the list of lost qualifs
if (!defined($LostQualifList{$filename})) {
$LostQualifList{$filename} = $_[1];
}
else {
$LostQualifList{$filename} = "$LostQualifList{$filename} $_[1]";
}
}
return;
}
#
# Add to the list of already tested files
#
# If the "file" is a directory try Welcome/welcome/index.html
if (-d $filename) {
#print "xx $filename is a directory, trying Welcome/welcome/index.html.\n";
$found = 0;
foreach $default_file (@default_files) {
#print "Trying $default_file\n";
if (-f ($filename . '/' . $default_file)) {
$dirname=$filename;
$file= $default_file;
$found = 1;
last;
}
}
if (! $found) {
print "xx No Welcome/welcome/index.html can be found in $filename\n";
print "xx Referenced by: $_[1]\n";
# Add to list of lost files
if (!defined($LostFileList{$filename})) {
$LostFileList{$filename} = $_[1];
}
else {
$LostFileList{$filename} = "$LostFileList{$filename} $_[1]";
}
return;
}
# Move to the specified directory
if ($debug) {
print "Chdir to $dirname\n";
}
chdir($dirname);
$dir=&Get_PWD;
if ($debug) {
print "Now in $dir\n";
}
$filename = $dir . $file;
if ($debug) {
print "** Filename is now: $filename\n";
print "** Dirname is now: $dir\n";
}
}
if (! -f $filename) {
print "xx $filename cannot be found\n";
print "xx Referenced by: $_[1]\n";
# Add to list of lost files
if (!defined($LostFileList{$filename})) {
$LostFileList{$filename} = $_[1];
}
else {
$LostFileList{$filename} = "$LostFileList{$filename} $_[1]";
}
return;
}
# Binary file? (pictures,...)
if (-B $filename) {
if ($debug) {
print "** Binary file added to images";
}
if (defined($ImageFileList{$filename})) {
return;
}
if (!defined($ImageFileList{$filename})) {
$ImageFileList{$filename} = $_[1]; # Define!
}
else {
$ImageFileList{$filename} = "$ImageFileList{$filename} $_[1]";
}
if ($debug) {
print "\n\nAdded: $filename to list of images\n";
}
return;
}
# else it's a text (html)file
if (!defined($FileList{$filename})) {
$FileList{$filename} = $_[1]; # Define!
}
else {
$FileList{$filename} = "$FileList{$filename} $_[1]";
return; # Already did this file
}
if ($debug) {
print "** Added: $filename \n";
}
# World readable?
($_,$_,$mode) = stat($filename);
$readmode = ($mode & 4);
if ($readmode == 0) {
# Not world readable, add to list
#print "xx Warning: $filename is not world readable\n";
if (!defined($UnreadableList{$filename})) {
$UnreadableList{$filename} = $_[1];
}
else {
$UnreadableList{$filename} = "$UnreadableList{$filename} $_[1]";
}
}
if ($HTML_only) {
# Filename *must* have extension .html, else we don't inspect it.
if ($filename !~ /.*\.html$/i) {return;}
}
# Apply the regexp to avoid certain files
if ($Avoid ne "") {
if ($filename =~ m/$Avoid/) {
print "** The above file is avoided.\n";
return;
}
}
} #sub Get_Refs
# Check external URLs
if (! $LocalOnly) {
if (! $Silent) {
print <<"E_O_T";
- - - - - - - - - - - - - - - - - - - - - - - - - - -
Going to really check external URLs via the network.
This may take some time. Simply abort webxref if you
are out of patience.
- - - - - - - - - - - - - - - - - - - - - - - - - - -
E_O_T
}
&http(%HTTPList, "Checking external URLs:");
print "\nAll done.\n";
}
exit;
sub PrintUsage {
print <<"E_O_T";
Usage: webxref -help -nohttp -htmlonly -silent -avoid regexp file.html
E_O_T
exit;
}
sub PrintHelp {
print <<"E_O_T";
Usage: webxref -help -nohttp -htmlonly -silent -avoid regexp file.html
-nohttp: do not check external URLs
-htmlonly: only inspect files with the .html extension
-silent: only output error/problem messages
-avoid regexp: avoid files with names matching regexp for inspection
Examples
webxref file.html
checks file.html and files/URLs referenced from file.html
webxref -nohttp file.html
checks file.html, but not external URLs
webxref -htmlonly file.html
checks file.html, but only files with the .html extension
webxref -avoid '.*Archive.*' file.html
checks file.html but avoids files with names containing
'Archive'
webxref -avoid '.*Archive.*|.*Distribution.*' file.html
Same as above, but also avoids files with names containing
'Distribution'
E_O_T
exit;
}
#---------------------------------------------
sub Base_Name {
# return basename,
# e.g. /home/sscprick/.WWW/Welcome.html
# returns: Welcome.html
local($local_filename)=$_[0];
$local_filename =~ s#.*/##; # remove the directory name -> file name
$local_filename;
}
sub Dir_Name {
# return dirname,
# e.g. /home/sscprick/.WWW/Welcome.html
# returns: /home/sscprick/.WWW/
local($local_filename)=$_[0];
$local_filename =~ s#.*/##; # remove the directory name -> file name
local($local_dirname) = $_[0];
$local_filename =~ s/(\W)/\\$1/g; # escape regexp chars
$local_dirname =~ s/$local_filename$//; # wipe filename at end -> dir name
$local_dirname;
}
sub CheckQual {
# See if #section qual is present in file
local($fn, $qual) = @_;
$qual =~ s/(\W)/\\$1/g; # quote rexep chars
open(CH_HTML, $fn) || die "xx Could not open $fn\n";
while (<CH_HTML>) {
chop;
if (/<a[ ]+name[ ]*=[ ]*"?$qual"?/i) {
close(CH_HTML);
return 1;
}
}
close(CH_HTML);
return 0;
}
#---------------------------------------------
sub http {
# Checks one URL by sending an HTTP HEAD request; returns the HTTP
# status code, or a negative code if the connection failed.
# (NB: the split of $URL into $host, $port, and $path below is a
# plausible sketch; webxref's original parsing may have differed.)
local($URL) = $_[0];
local($host, $port, $path) =
($URL =~ m#^http://([^:/]+):?([0-9]*)(/.*)?$#i);
if (!defined($host)) { return -1; }
if ($port eq "") { $port = 80; }
if (!defined($path) || ($path eq "")) { $path = "/"; }
if ($debug) {
print "\n URL: $URL\n host: $host\n port: $port\n path: $path\n";
}
# The following is largely taken from the Camel book, chapter 6
$AF_INET = 2;
$SOCK_STREAM = 1;
$sockaddr = 'S n a4 x8';
chop($hostname = `hostname`);
($name,$aliases,$proto) = getprotobyname('tcp');
($name,$aliases,$port) = getservbyname($port,'tcp') unless $port =~ /^\d+$/;
($name,$aliases,$type,$len,$thisaddr) = gethostbyname($hostname);
if (!(($name,$aliases,$type,$len,$thataddr) = gethostbyname($host))) {
return -1;
}
$this = pack($sockaddr, $AF_INET, 0, $thisaddr);
$that = pack($sockaddr, $AF_INET, $port, $thataddr);
# Make the socket filehandle.
# ** Temporary fix, this is NOT The way to do it. 15-APR-96
if (!(socket(S, $AF_INET, $SOCK_STREAM, $proto))) {
$SOCK_STREAM = 2;
if (!(socket(S, $AF_INET, $SOCK_STREAM, $proto))) { return -2; }
}
# Give the socket an address
if (!(bind(S, $this))) {
return -3;
}
if (!(connect(S,$that))) {
return -4;
}
select(S); $| = 1; select(STDOUT);
print S "HEAD $path HTTP/1.0\n\n";
$response = <S>;
($protocol, $status) = split(/ /, $response);
while (<S>) {
#print;
}
close(S);
#print "Status: $status\n";
return $status;
}
#---------------------------------------------
sub Print_List {
# The header must be popped off first: assigning to a hash would
# swallow the entire argument list, leaving $header empty.
local($header) = pop(@_);
local(%list) = @_;
local($file);
# Don't list empty lists
if (! %list) {return};
print "\n\n----------------\n$header\n";
@TheList=keys(%list);
@SortedList = sort @TheList;
foreach $file (@SortedList) {
print "$file \n";
@lost = split(/ /,$list{$file});
@sortlost = sort @lost;
print " Referenced by:\n";
foreach $lostfile (@sortlost) {
print " $lostfile\n";
}
}
} # sub Print_List
sub Print_Lists {
# Print lists
# List all files found
if (!$Silent) { &Print_List(%FileList,"Web documents found:");}
# List of directories referenced
if (!$Silent) { &Print_List(%DirList,"Directories:");}
# List of images referenced
if (!$Silent) { &Print_List(%ImageFileList,"Images:");}
# List of mailto's
if (!$Silent) { &Print_List(%MailList,"Mailto:");}
# List of ftp's
if (!$Silent) { &Print_List(%FTPList,"ftp:");}
# List of telnets
if (!$Silent) { &Print_List(%TelnetList,"telnet:");}
# List of gophers
if (!$Silent) { &Print_List(%GopherList,"gopher:");}
# List of news
if (!$Silent) { &Print_List(%NewsList,"News:");}
# List of http's
if (!$Silent) { &Print_List(%HTTPList,"External URLs:");}
# List of file:'s
if (!$Silent) { &Print_List(%ExtFileList,"External file:");}
# List of cgi-bin scripts/forms
if (!$Silent) { &Print_List(%CGIList,"cgi-bin scripts/forms:");}
# List of name qualifs
if (!$Silent) { &Print_List(%QualifList,"Name qualifs found:");}
# List of files that can't be found
&Print_List(%LostFileList,"Files not found:");
# List of files that are not world readable
&Print_List(%UnreadableList,"Files not world readable:");
# List of directories that can't be found
&Print_List(%DirNotFoundList,"Directories not found:");
# List of name qualifs not found
&Print_List(%LostQualifList,"Name qualifs not found:");
if ($HTML_only) { print "\nDone.\n"; }
} #sub Print_Lists
# This is the last line of the webxref script really.
# If this line is missing, the file was truncated in transit.