#! /usr/bin/perl
# Checks consistency of links within a site
# Created June 1997 by J. Stolfi
# Based on "webxref", written 1995 by Rick Jansen (rick@sara.nl)
# wwwcheck verifies the contents of a site, checking whether
#
# (a) all files are accessible from a given set of root URLs, and
# (b) all links in all pages are valid.
#
# Glossary:
#
# An URL consists of an optional "protocol" ("http:", "ftp:", etc)
# followed by a "locator". The URL is "ordinary" if it begins with
# "http:" or with no protocol. It is "special" if it begins with any
# other protocol (such as "mailto:", "telnet:", "news:", "file:",
# "ftp:", or "gopher:").
#
# The locator of an ordinary URL consists of a possibly empty "path",
# and an optional "qualifier" (from the last "#" onwards, or the
# arguments of a cgi-bin call).
#
# A "dirpath" is a path that ends with "/".
#
# A path is "global" if it begins with "//", and "local" otherwise.
#
# A "hostpath" is either empty (meaning the local file system)
# or "//" followed by a host name and optional port number
# (meaning an HTTP virtual file system).
#
# A local path is "absolute" if it begins with "/", and "relative"
# otherwise. Global paths are absolute by definition.
#
# "Completing" a local relative path means prefixing to it some
# given hostpath and some absolute dirpath, usually derived from the
# context. Completing an absolute local path means prefixing it with
# a given hostpath. Completing a global path is a no-op. (Note that
# the hostpath may be empty, so the resulting path may be either local
# or global.)
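# (For example, completing the relative path "img/logo.gif" against
# hostpath "//www.unicamp.br" and dirpath "/~mmh/" yields
# "//www.unicamp.br/~mmh/img/logo.gif".)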
#
# "Localizing" a global path means checking whether some prefix of
# it matches some member of a user-specified list of directory
# paths; and, if it does, replacing that prefix by some local
# directory path. Localizing a local path is a no-op.
#
# "Actualizing" a local path means looking it up in the local
# filesystem, following through symbolic links, and replacing it by
# the actual name of the final file (as defined by "cd" and "pwd").
# This process may be carried out incompletely if the path does not
# name a valid directory. Actualizing a global path is a no-op.
#
# "Normalizing" an ordinary URL means discarding the protocol and
# qualifier parts, and then completing, localizing, and actualizing
# the remaining path. Normalizing a special URL is a no-op.
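# (For example, assuming "-map http://www.unicamp.br/ /home/www" was
# given, the URL "http://www.unicamp.br/~mmh/index.html#sec2" is
# normalized by discarding "http:" and "#sec2", localizing the result
# to "/home/www/~mmh/index.html", and actualizing that path in the
# local filesystem.)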
#
# A "site" is a set of directory paths. (Currently they must be
# local). The "contents" of a site are the files and
# sub-directories in the directories named by those paths. Symbolic
# links are followed whenever they point to valid objects. A path
# is "internal" if its directory part is one of the site's
# directories, and "external" otherwise.
#
# A "page" is a text in HTML format.
#
# A "link" from a page is an URL that is mentioned in that page, either
# as an or
. A "root link"
# is an URL provided through the "-root" option.
#
# "Fetching" a local path means reading the contents of the file named by it.
# If the named file is actually a directory, then the standard entry page names
# ("index.html", "Welcome.html", etc.) are tried, in a fixed order.
# Fetching a global path means accessing it via the HTTP protocol.
# In either case, the outcome is either a page, or an access error.
#
# An ordinary URL is said to "work" if, after normalization,
# fetching it doesn't yield an error. A special URL is assumed
# to work iff it is not "file:" and is syntactically OK.
#
# An URL is "valid" if it works, or the user has explicitly
# declared it to be valid. An URL is "invalid" if it doesn't work,
# or the user has explicitly declared it to be invalid. (In case
# the user declares an URL to be both valid and invalid, the latter
# takes precedence).
#
# "Testing" a local or global URL means checking whether it works,
# i.e. whether accessing it will not yield an error code.
# "Classifying" an URL means checking whether it is valid or invalid.
# "Scanning" a page means collecting all the URLs in the links in
# that page, normalizing them, and classifying them.
use strict 'subs';
$usage =
"wwwcheck \\
[-localOnly] [-silent] \\
[-valid ERE].. [-invalid ERE].. [-dontScan ERE].. \\
[-root URL].. [-map URL DIR] \\
DIR...";
#---------------------------------------------
# Set $debug = 1 to print internal diagnostic messages
$debug = 0;
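#---------------------------------------------
# PrintUsage() / PrintHelp()
#
# ParseArgs (below) relies on these. Minimal versions based on $usage;
# a fuller help text, like webxref's, remains to be adapted.
sub PrintUsage ()
{
  print "usage: $usage\n";
  exit 1;
}
sub PrintHelp ()
{
  print "usage: $usage\n";
  exit 0;
}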
#---------------------------------------------
# EnsureDirLike($dir)
#
# Makes sure $dir ends with "/"
sub EnsureDirLike($)
{
my($dir) = $_[0];
if ($dir =~ m#/$#)
{ return $dir; }
else
{ return "$dir/"; }
}
#---------------------------------------------
# EnsureGlobalLike($url)
#
# Makes sure $url begins with "//"
sub EnsureGlobalLike($)
{
my($url) = $_[0];
$url =~ s#^http:##i;
if ($url =~ m#^//#)
{ return "$url"; }
else
{ die "bad global URL = $url, stopped"; }
}
#---------------------------------------------
# EnsureLocalLike($url)
#
# Makes sure $url does not begin with any protocol or "//"
sub EnsureLocalLike($)
{
my($url) = $_[0];
if (($url =~ m#^[a-zA-Z]*:#) || ($url =~ m#^//#))
{ die "bad local filename = $url, stopped"; }
else
{ return $url; }
}
#---------------------------------------------
# GetCWD($default)
#
# Gets the actual name of the current working directory.
# In case of failure, returns $default.
# Makes sure it ends with a slash.
sub GetCWD($)
{
my($default) = $_[0];
my($dir) = `pwd`;
if ($dir eq "")
{ $dir = $default; }
else
{ chomp $dir; }
return EnsureDirLike($dir);
}
#---------------------------------------------
# SplitURL($url)
#
# Splits an URL into protocol, path, and qualifier
# e.g. http://www.unicamp.br/~mmh/foobar/.WWW/Welcome.html#foobar
# returns: ('http:', '//www.unicamp.br/~mmh/foobar/.WWW/Welcome.html', '#foobar')
# Any part may be missing.
sub SplitURL($)
{
my($url)=$_[0];
my($qual,$prot);
if ($url =~ m/^([a-z]*:)"/)
{ $prot = $1; $url = $2; }
else
{ $prot = ""; }
if ($url =~ m/^(.*)(#[^#\/]*)$/)
{ $url = $1; $qual = $2; }
elsif ($url =~ m#^(.*/cgi-bin/[^?]*)(\?.*)$#)
{ $url = $1; $qual = $2; }
else
{ $qual = ""; }
return ($prot, $url, $qual);
}
#---------------------------------------------
# SplitPath($path)
#
# Splits a path into hostpath, dir, and file.
# e.g. //www.unicamp.br/~mmh/foobar/.WWW/Welcome.html
# returns: ('//www.unicamp.br', '/~mmh/foobar/.WWW/', 'Welcome.html')
# Any part may be empty. If not empty, the directory ends with "/".
# If not empty, the hostpath begins with "//" and contains no other "/".
# The file part contains no "/".
sub SplitPath($)
{
my($file)=$_[0];
my($host,$dir);
if ($file =~ m#^(//[^/]*)(.*)$# )
{ $host = $1; $file = $2; }
else
{ $host = ""; }
if ($file =~ m#^(.*/)([^/]*)$# )
{ $dir = $1; $file = $2; }
else
{ $dir = ""; }
return ($host, $dir, $file);
}
#---------------------------------------------
# CompletePath($host,$dir,$path)
#
# Given a hostpath $host (possibly empty), a directory path $dir, and
# an arbitrary path $path, prepends $host and $dir to $path if $path
# is local relative, prepends just $host if $path is local absolute,
# and returns $path unchanged if $path is global. Assumes $dir begins
# and ends with "/", and $host is either empty or begins with "//" and
# contains no other "/"
sub CompletePath ($$$)
{
my($host,$dir,$path) = @_;
if (!($path =~ m#^/#)) { $path = $dir .$path; }
if (!($path =~ m#^//#)) { $path = $host .$path; }
return $path;
}
#---------------------------------------------
# LocalizePath($path)
#
# If $path is local, returns it unchanged.
# If $path is global, checks whether some prefix of $path
# matches some key in %LocalDirs, and if so
# replaces that prefix by the corresponding value.
#
# Assumes the keys in %LocalDirs begin with "//" and end with "/".
# Usually, the corresponding values begin with "/" but not "//"
# and end with "/".
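# e.g. if %LocalDirs maps "//www.unicamp.br/" to "/home/www/", then
# LocalizePath("//www.unicamp.br/~mmh/x.html") returns
# "/home/www/~mmh/x.html".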
%LocalDirs = (); # Table mapping global PATHs to local directories.
sub LocalizePath ($)
{
my($path) = $_[0];
if ($path =~ m#^//#)
{
foreach my $key (keys(%LocalDirs))
{ my($len) = length($key);
if ($key eq substr($path, 0, $len))
{
my($res) = $LocalDirs{$key} . substr($path, $len);
return $res;
}
}
}
return $path;
}
#---------------------------------------------
# ActualizePath($path)
#
# If $path is a local absolute path, tries to get the actual absolute pathname
# of the file named by it. If the file doesn't exist, returns some
# partially actualized path. If $path is global, returns $path
# unchanged. Fails if $path is relative.
sub ActualizePath ($)
{
my($path) = $_[0];
my($dir);
if ($path =~ m#^//#)
{ return $path }
elsif (!($path =~ m#^/#))
{ die "non-absolute path = $path, stopped"; }
else
{ my($count) = 0;
do
{ if (-d($path))
{ $dir = EnsureDirLike($path); $path = ""; }
elsif ($path =~ m#^(.*/)([^/]*)# )
{ $dir = $1; $path = $2; }
else
{ # $path is relative to $dir; $dir remains unchanged
# Never happens on the first iteration.
}
if (chdir($dir))
{ $dir = GetCWD($dir);
if (($path ne "") && -l($path))
{ $path = readlink($path); }
else
{ return $dir . $path; }
}
else
{ return $dir . $path; }
++$count;
}
until ($count >= 50);
die "too many link hops = $dir$path, stopped";
}
}
#---------------------------------------------
# NormalizeURL($host,$dir,$url)
#
# Assumes $url is a link extracted from some HTML page
# that was found in the absolute local dirpath $dir
# in the hostpath $host (which may be empty).
# If $url is a special URL, returns it unchanged.
# Else, discards the "http:" protocol and qualifier, if any,
# and completes, localizes, and actualizes the remaining path.
#
# Assumes $dir begins and ends with "/", and $host
# is either empty or begins with "//" and has no other "/".
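# e.g. NormalizeURL("", "/home/www/~mmh/", "notes/x.html#sec2")
# completes to "/home/www/~mmh/notes/x.html", then localizes and
# actualizes that path.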
sub NormalizeURL ($$$)
{
my($host,$dir,$url) = @_ ;
my($prot,$path,$qual) = SplitURL($url);
if (($prot eq "") || ($prot =~ m#^http:$#i))
{
return ActualizePath(LocalizePath(CompletePath($host,$dir,$path)));
}
return $url;
}
#---------------------------------------------
# Execute directory:
#
$ExecDir = `pwd`;
chomp($ExecDir);
#---------------------------------------------
# Command line arguments:
$LocalOnly = 0; # 1 classifies only links to local URLs
$DontScan = ""; # ERE of pages not to scan
$Valid = ""; # ERE of valid pages
$Invalid = ""; # ERE of invalid pages
$Silent = 0; # If silent=1 only error msgs will be printed
@DirList = (); # Directories in site: unnormalized, local or global.
@RootList = (); # Root URLS, unnormalized.
sub ParseArgs()
{
while (scalar(@ARGV) && ($ARGV[0] =~ /^-/))
{ if ($debug) { print "parsing $ARGV[0]\n"; }
if ($ARGV[0] eq "-help")
{
PrintHelp();
}
elsif ($ARGV[0] eq "-localOnly")
{
$LocalOnly = 1;
}
elsif ($ARGV[0] eq "-silent")
{
$Silent = 1;
}
elsif ($ARGV[0] eq "-map")
{
shift(@ARGV);
my($url) = EnsureDirLike(EnsureGlobalLike($ARGV[0]));
shift(@ARGV);
my($dir) = ActualizePath(EnsureLocalLike($ARGV[0]));
$LocalDirs{$url} = $dir;
print "will map $url --> $dir\n";
}
elsif ($ARGV[0] eq "-dontScan")
{
shift(@ARGV);
if ($DontScan ne "") { $DontScan = $DontScan . "|"; }
$DontScan = $DontScan . $ARGV[0];
}
elsif ($ARGV[0] eq "-valid")
{
shift(@ARGV);
if ($Valid ne "") { $Valid = $Valid . "|"; }
$Valid = $Valid . $ARGV[0];
}
elsif ($ARGV[0] eq "-invalid")
{
shift(@ARGV);
if ($Invalid ne "") { $Invalid = $Invalid . "|"; }
$Invalid = $Invalid . $ARGV[0];
}
elsif ($ARGV[0] eq "-root")
{
shift(@ARGV);
my($page) = EnsureGlobalLike($ARGV[0]);
push(@RootList, $page);
}
else
{
PrintUsage();
}
shift(@ARGV);
}
@DirList = @ARGV;
print "Parsed args\n";
return;
}
#---------------------------------------------
# Site directories:
#
# The table below maps each (normalized) directory that belongs to the
# site to a list of the referenced URLs that lie in that directory.
%SiteDirs = ();
#---------------------------------------------
# Referenced URL tables:
#
# Each URL that has been seen by wwwcheck appears, in normalized form,
# in only one of the hash tables below. In every case, the key is the
# URL in question, and the value is the list of its referents --- the
# URLs of all pages where that URL appears, and/or the word "[ROOT]"
# if it was given through the "-root" option. The referent list is actually
# a string, with " " as the element separator.
#
%GlobalUnclassifiedURLs = (); # Global URLs seen but not classified yet.
%LocalUnclassifiedURLs = (); # Local URLs seen but not classified yet.
%ValidURLs = (); # URLs found to be valid.
%InvalidURLs = (); # URLs found to be invalid.
%UnreferencedURLs = (); # URLs contained in the site but not referenced yet.
#---------------------------------------------
# Main procedure
sub MainProc()
{
ParseArgs();
NormalizeRoots();
NormalizeSiteDirs();
RecClassifyURLs();
PrintLists();
print "All done.\n"
}
#---------------------------------------------
# PushRef($host,$dir,$url,$referent)
#
# Adds a recently seen URL to the URL tables.
# If the URL has already been classified, appends
# its referent to %ValidURLs or %InvalidURLs; else appends it to the
# %GlobalUnclassifiedURLs or %LocalUnclassifiedURLs
sub PushRef ($$$$)
{
my($host,$dir,$url,$referent) = @_;
$url = NormalizeURL($host,$dir,$url);
if (defined($ValidURLs{$url}))
{ $ValidURLs{$url} .= " " . $referent; }
elsif (defined($InvalidURLs{$url}))
{ $InvalidURLs{$url} .= " " . $referent; }
elsif (($url =~ m#^[a-z]*:#i) || ($url =~ m#^//#))
{ if (defined($GlobalUnclassifiedURLs{$url}))
{ $GlobalUnclassifiedURLs{$url} .= " " . $referent; }
else
{ $GlobalUnclassifiedURLs{$url} = $referent; }
}
else
{ if (defined($LocalUnclassifiedURLs{$url}))
{ $LocalUnclassifiedURLs{$url} .= " " . $referent; }
else
{ $LocalUnclassifiedURLs{$url} = $referent; }
}
}
#---------------------------------------------
# PopUnclassifiedURL ()
#
# Removes a normalized but still unclassified URL from
# %LocalUnclassifiedURLs or %GlobalUnclassifiedURLs, and returns a
# two-element array containing that URL and its referents.
# Returns () if there are none.
sub PopUnclassifiedURL ()
{
my($url);
if (scalar(%LocalUnclassifiedURLs))
{ ($url) = keys(%LocalUnclassifiedURLs);
return ($url, delete($LocalUnclassifiedURLs{$url}));
}
elsif (scalar(%GlobalUnclassifiedURLs))
{ ($url) = keys(%GlobalUnclassifiedURLs);
return ($url, delete($GlobalUnclassifiedURLs{$url}));
}
else
{ return (); }
}
#---------------------------------------------
# NormalizeRoots()
#
# Scans @RootList, normalizes each root URL, inserts it into
# the %LocalUnclassifiedURLs or %GlobalUnclassifiedURLs table.
sub NormalizeRoots ()
{
foreach my $url(@RootList)
{
my($page) = NormalizeURL("", "", $url);
PushRef("", "", $page, "[ROOT]");
}
}
#---------------------------------------------
# NormalizeSiteDirs()
#
# Scans @DirList, normalizes each directory path, and inserts it into
# the %SiteDirs table. Currently requires them to be local
# (after localization).
sub NormalizeSiteDirs ()
{
foreach my $url (@DirList)
{
my($dir);
$dir = EnsureLocalLike(NormalizeURL("", EnsureDirLike($ExecDir), EnsureDirLike($url)));
if (!defined($SiteDirs{$dir}))
{ $SiteDirs{$dir} = ""; }
else
{ print "! Duplicate site dir: $url = $dir\n"; }
}
}
#---------------------------------------------
# RecClassifyURLs()
#
# Examines each unclassified URL, classifies it, and adds it
# (with its referent list) to %ValidURLs or %InvalidURLs, as appropriate.
# Whenever it finds a new valid
# internal URL that does not match the $DontScan pattern and yields an
# HTML page, scans that page, and adds its links to the proper
# tables (%LocalUnclassifiedURLs, %ValidURLs, and %InvalidURLs). Assumes
# the URLs in those tables have been normalized.
sub RecClassifyURLs()
{
my(@entry) = PopUnclassifiedURL();
while (scalar(@entry))
{
my($url, $refs) = @entry;
if (!($Silent)) { print "Classifying: ${url}\n"; }
my($prot,$path,$qual) = SplitURL($url);
my($valid, $internal);
if ($prot eq "file:")
{ # "file:" URLs are invalid because they don't work elsewhere
$valid = 0; $internal = 0;
}
elsif ($prot =~ m#^(mailto:|telnet:|ftp:|gopher:|news:)$#)
{ # These special URLs are valid by default.
$valid = 1; $internal = 0;
}
elsif (($prot eq "") || ($prot =~ m#^http:$#i))
{ # Ordinary URL: internal iff its directory part is a site directory.
my($phost,$pdir,$pfile) = SplitPath($path);
$internal = (!($path =~ m#^//#)) && defined($SiteDirs{$pdir});
if (($Invalid ne "") && ($url =~ m/$Invalid/)) { $valid = 0; }
elsif (($Valid ne "") && ($url =~ m/$Valid/)) { $valid = 1; }
else { $valid = &TestURL($url); }
}
else
{ print "! Invalid protocol in link = $prot\n";
$valid = 0; $internal = 0;
}
# Move to the proper table, and scan if appropriate:
if ($valid)
{ if (defined($ValidURLs{$url}))
{ die "! duplicate valid URL = $url, stopped"; }
$ValidURLs{$url} = $refs;
if ($internal && (($DontScan eq "") || !($url =~ m/$DontScan/)))
{ ScanURL($path); }
}
else
{ if (defined($InvalidURLs{$url}))
{ die "! duplicate invalid URL = $url, stopped"; }
$InvalidURLs{$url} = $refs;
}
@entry = PopUnclassifiedURL();
}
}
#---------------------------------------------
# ScanURL($path)
#
# Fetches the file named by $path; if it is HTML, collects all links
# normalizes them, and places them in %LocalUnclassifiedURLs or
# updates %ValidURLs and %InvalidURLs.
sub ScanURL ($)
{
my($path) = $_[0];
if ($path =~ m#^//#)
{ die "can't scan remote files yet, stopped"; }
else
{ ScanFile($path); }
}
#---------------------------------------------
# Enumerates @DirList and puts their contents in
# %UnreferencedURLs (files and dangling links) and
# %UnreferencedSubDirs (sub-directories)
sub GetSiteContents ()
{
print "! GetSiteContents not written yet\n";
return;
%UnreferencedURLs = ();
%UnreferencedSubDirs = ();
foreach my $dir (keys(%SiteDirs))
{ # Does the directory exist at all?
stat($dir);
die "Cannot find directory $dir\n" unless -d $dir;
GetDirContents($dir);
}
}
#---------------------------------------------
# Enumerates a given directory and adds its contents to
# %UnreferencedURLs (files and dangling links) and
# %UnreferencedSubDirs (sub-directories)
# The argument is an URL (local or global) that designates a directory.
sub GetDirContents ($)
{
my($dir) = @_;
print "! GetDirContents not written yet\n";
return;
}
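#---------------------------------------------
# PrintLists()
#
# Prints the classified URL tables. This is a minimal version, pending
# a full report along the lines of webxref's Print_Lists: invalid URLs
# are always shown, valid ones only when not $Silent.
sub PrintLists ()
{
PrintURLTable(\%InvalidURLs, "Invalid URLs:");
if (!$Silent) { PrintURLTable(\%ValidURLs, "Valid URLs:"); }
}
#---------------------------------------------
# PrintURLTable($table,$header)
#
# Prints the URLs in the table referenced by $table, each followed by
# its referent list.
sub PrintURLTable ($$)
{
my($table,$header) = @_;
if (!scalar(%$table)) { return; }
print "\n$header\n";
foreach my $url (sort(keys(%$table)))
{ print "$url\n";
print "  Referenced by:\n";
foreach my $ref (sort(split(/ /, $$table{$url})))
{ print "    $ref\n"; }
}
}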
#---------------------------------------------
# HTTP status codes and messages
%OkStatusMsgs = (
200, "OK 200",
201, "CREATED 201",
202, "Accepted 202",
203, "Partial Information 203",
204, "No Response 204",
);
%FailStatusMsgs = (
-1, "Could not lookup server",
-2, "Could not open socket",
-3, "Could not bind socket",
-4, "Could not connect",
301, "Found, but moved",
302, "Found, but data resides under different URL (add a /)",
303, "Method",
304, "Not Modified",
400, "Bad request",
401, "Unauthorized",
402, "PaymentRequired",
403, "Forbidden",
404, "Not found",
500, "Internal Error",
501, "Not implemented",
502, "Service temporarily overloaded",
503, "Gateway timeout ",
600, "Bad request",
601, "Not implemented",
602, "Connection failed (host not found?)",
603, "Timed out",
);
#---------------------------------------------
# Files to try in case of a directory reference like ../..
@default_files = (
'Welcome.html',
'welcome.html',
'index.html',
'index.shtml',
'README.html'
);
#---------------------------------------------
# Tests whether an URL is valid.
# If it is a special URL (e.g. "mailto:"), checks the syntax only.
# If it is a local URL, checks whether the file exists and is readable
# (if the URL is a directory, completes it with the standard paths).
# If it is a global URL, tries to fetch it.
# Returns 1 if valid, 0 if invalid.
sub TestURL ($)
{
# STUB: for now, every URL is assumed to work.
return 1;
}
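#---------------------------------------------
# TestLocalPath($path)
#
# A minimal sketch of the local half of the test that TestURL should
# eventually perform (hypothetical helper, not called anywhere yet):
# checks that the file named by the local path $path exists and is
# readable, trying the standard entry pages when $path is a directory.
sub TestLocalPath ($)
{
my($path) = $_[0];
if (-d $path)
{ # Directory: try the standard entry pages, in order.
foreach my $def (@default_files)
{ my($f) = EnsureDirLike($path) . $def;
if ((-f $f) && (-r $f)) { return 1; }
}
return 0;
}
return ((-f $path) && (-r $path)) ? 1 : 0;
}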
#---------------------------------------------
# CleanupLink($link)
#
# Takes a raw link tag, e.g. `<a href="URL">' or
# `<img align=middle src="URL">', and returns a list of the
# bare URLs in it.
#
sub CleanupLink($)
{
my($link) = $_[0] . " ";
my(@urls) = ();
my($url);
$link =~ s/^[ ]*<[ ]*//;
while($link ne "")
{
$url = "";
if ($link =~ m/^[a-zA-Z]+[ ]*=/)
{ # NAME = VALUE item
$link =~ s/^[a-zA-Z]+[ ]*=[ ]*//;
if ($link =~ m/^"/)
{ # quoted URL: remove quotes
if ($link =~ m/^".*"/)
{ ($url, $link) = ($link =~ m/^"([^"]*)"(.*)$/);
$url =~ s/^[ ]+//;
$url =~ s/[ ].*$//;
}
else
{ print "! unmatched \" in link = `${link}'\n";
if (!($link =~ s/^".*[ ]+//)) { $link = ""; }
$url = "";
}
}
else
{ # unquoted URL: delimited by blank or ">"
($url,$link) = ($link =~ m/^([^ >]*)([ >].*|)$/);
}
if ($url ne "") { push(@urls, $url); }
}
else
{ # tag name or other non-URL word: discard it
$link =~ s/^[^ >]*//;
}
$link =~ s/^[ >]+//; # skip delimiters
}
return @urls;
}
#---------------------------------------------
# ScanFile($filename)
#
# Scans a local HTML file, collecting all links, normalizing them,
# and adding them to the URL tables (via PushRef).
# Uses the temporary file:
$temp_file = "/tmp/webxref.$$";
sub ScanFile ($)
{
my($filename) = $_[0];
my($text) = "";
my($size);
if (!(open(HTML, $filename)))
{ print "! Could not open file $filename\n"; return; }
# Slurp the whole file into $text:
my($offset)=0;
do {
$size = read(HTML,$text,32768,$offset);
$offset += $size;
} until $size != 32768;
close(HTML);
# Make sure all <tags> end up on lines by themselves:
open(TEMP, ">$temp_file") || die "Could not create $temp_file\n";
$text =~ s/\n/ /g;
$text =~ s/^[^<]*//;
$text =~ s/(<[^>]*>)[^<]*/$1\n/g;
print TEMP "$text";
$text="";
close(TEMP);
open(HTML, $temp_file) || die "Could not open $temp_file\n";
my(%newlist);
while (<HTML>) {
chop;
s/\s+/ /g; # replace funny spaces by ordinary spaces.
# Keep only tags that carry an URL-valued attribute:
next unless s/^<[^>=]*(href|src|action|background)[ ]*=[ ]*//i;
if (m/^"/)
{ # quoted URL: remove quotes
if (!(s/^"([^"]*)".*$/$1/))
{ print "! unmatched \" in link = `$_'\n"; next; }
s/^[ ]+//; s/[ ].*$//;
}
else
{ # unquoted URL: delimited by blank or ">"
s/[ >].*$//;
}
next if ($_ eq "");
# Link to section within current document?
if (m/^#.*/)
{ # ignore for now
}
else
{ # Link to another document
if ($debug) { print "added to newlist: $_\n"; }
$newlist{$_} = 1;
}
}
close(HTML);
if ($debug) {
# List the links found
print "\nNewlist:\n";
foreach my $url (keys(%newlist)) {
print "$url \n";
}
}
# Walk the list, pushing each link with this page as its referent:
my($host,$dir,$file) = SplitPath($filename);
foreach my $url (keys(%newlist)) {
PushRef($host, $dir, $url, $filename);
}
unlink($temp_file);
}
#---------------------------------------------
MainProc();
__END__
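# NOTE: Everything below this "__END__" marker is leftover webxref
# code, kept for reference during the rewrite; Perl does not compile
# or execute it.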
sub Get_Refs {
# Recursively get all files referenced from the given file
local(%newlist);
local($file);
local($dir);
local($Old_Dir);
local($filename);
$dir=&Dir_Name($_[0]);
if ($dir eq "") {
$dir = &Get_PWD;
}
$file=&Base_Name($_[0]);
#print "--------------------\n";
if ($debug) {
print "arg=$_[0]\n";
print "dir=$dir\n";
print "file=$file\n";
}
# directory reference?
if ($file eq "") {
if ($debug) {
print "$dir must be a dir, refd by $_[1]!\n";
}
if (-d $_[0]) {
if (!defined($DirList{$_[0]})) {
$DirList{$_[0]} = $_[1];
}
else {
$DirList{$_[0]} = "$DirList{$_[0]} $_[1]";
}
}
else {
if (!defined($DirNotFoundList{$_[0]})) {
$DirNotFoundList{$_[0]} = $_[1];
}
else {
$DirNotFoundList{$_[0]} = "$DirNotFoundList{$_[0]} $_[1]";
}
}
return;
}
# Move to the specified directory
$Old_Dir = &Get_PWD;
if ($debug) {
print "Chdir to $dir\n";
}
chdir($dir);
$dir=&Get_PWD;
if ($debug) {
print "Now in $dir\n";
}
$filename = $dir . $file;
if (! $Silent) {
print "Checking: $filename\n";
}
# Is it a reference to a specific section? (a file#section reference)
if ($filename =~ m/(.+)#(.+)/) {
$filename = "$1#$2";
if (&CheckQual($1, $2) ) {
#print "** Qual $2 is present in file $1\n";
# Add to the list of qualifs
if (!defined($QualifList{$filename})) {
$QualifList{$filename} = $_[1];
}
else {
$QualifList{$filename} = "$QualifList{$filename} $_[1]";
}
}
else {
print "xx Qual $2 is NOT present in file $1\n";
print "xx Referenced by: $_[1]\n";
#print "Qual filename: $filename\n";
# Add to the list of lost qualifs
if (!defined($LostQualifList{$filename})) {
$LostQualifList{$filename} = $_[1];
}
else {
$LostQualifList{$filename} = "$LostQualifList{$filename} $_[1]";
}
}
return;
}
#
# Add to the list of already tested files
#
# If the "file" is a directory try Welcome/welcome/index.html
if (-d $filename) {
#print "xx $filename is a directory, trying Welcome/welcome/index.html.\n";
$found = 0;
foreach $default_file (@default_files) {
#print "Trying $default_file\n";
if (-f ($filename . '/' . $default_file)) {
$dirname=$filename;
$file= $default_file;
$found = 1;
last;
}
}
if (! $found) {
print "xx No Welcome/welcome/index.html can be found in $filename\n";
print "xx Referenced by: $_[1]\n";
# Add to list of lost files
if (!defined($LostFileList{$filename})) {
$LostFileList{$filename} = $_[1];
}
else {
$LostFileList{$filename} = "$LostFileList{$filename} $_[1]";
}
return;
}
# Move to the specified directory
if ($debug) {
print "Chdir to $dirname\n";
}
chdir($dirname);
$dir=&Get_PWD;
if ($debug) {
print "Now in $dir\n";
}
$filename = $dir . $file;
if ($debug) {
print "** Filename is now: $filename\n";
print "** Dirname is now: $dir\n";
}
}
if (! -f $filename) {
print "xx $filename cannot be found\n";
print "xx Referenced by: $_[1]\n";
# Add to list of lost files
if (!defined($LostFileList{$filename})) {
$LostFileList{$filename} = $_[1];
}
else {
$LostFileList{$filename} = "$LostFileList{$filename} $_[1]";
}
return;
}
# Binary file? (pictures,...)
if (-B $filename) {
if ($debug) {
print "** Binary file added to images";
}
if (defined($ImageFileList{$filename})) {
return;
}
if (!defined($ImageFileList{$filename})) {
$ImageFileList{$filename} = $_[1]; # Define!
}
else {
$ImageFileList{$filename} = "$ImageFileList{$filename} $_[1]";
}
if ($debug) {
print "\n\nAdded: $filename to list of images\n";
}
return;
}
# else it's a text (html)file
if (!defined($FileList{$filename})) {
$FileList{$filename} = $_[1]; # Define!
}
else {
$FileList{$filename} = "$FileList{$filename} $_[1]";
return; # Already did this file
}
if ($debug) {
print "** Added: $filename \n";
}
# World readable?
($_,$_,$mode) = stat($filename);
$readmode = ($mode & 4);
if ($readmode == 0) {
# Not world readable, add to list
#print "xx Warning: $filename is not world readable\n";
if (!defined($UnreadableList{$filename})) {
$UnreadableList{$filename} = $_[1];
}
else {
$UnreadableList{$filename} = "$UnreadableList{$filename} $_[1]";
}
}
if ($HTML_only) {
# Filename *must* have extension .html, else we don't inspect it.
if ($filename !~ /.*\.html$/i) {return;}
}
# Apply the regexp to avoid certain files
if ($Avoid ne "") {
if ($filename =~ m/$Avoid/) {
print "** The above file is avoided.\n";
return;
}
}
} #sub Get_Refs
# Check external URLs
if (! $LocalOnly) {
if (! $Silent) {
print <<"E_O_T";
- - - - - - - - - - - - - - - - - - - - - - - - - - -
Going to really check external URLs via the network.
This may take some time. Simply abort webxref if you
are out of patience.
- - - - - - - - - - - - - - - - - - - - - - - - - - -
E_O_T
}
&http(%HTTPList, "Checking external URLs:");
print "\nAll done.\n";
}
exit;
sub PrintUsage {
print <<"E_O_T";
Usage: webxref -help -nohttp -htmlonly -silent -avoid regexp file.html
E_O_T
exit;
}
sub PrintHelp {
print <<"E_O_T";
Usage: webxref -help -nohttp -htmlonly -silent -avoid regexp file.html
-nohttp: do not check external URLs
-htmlonly: only inspect files with the .html extension
-silent: only output error/problem messages
-avoid regexp: avoid files with names matching regexp for inspection
Examples
webxref file.html
checks file.html and files/URLs referenced from file.html
webxref -nohttp file.html
checks file.html, but not external URLs
webxref -htmlonly file.html
checks file.html, but only files with the .html extension
webxref -avoid '.*Archive.*' file.html
checks file.html but avoids files with names containing
'Archive'
webxref -avoid '.*Archive.*|.*Distribution.*' file.html
Same as above, but also avoids files with names containing
'Distribution'
E_O_T
exit;
}
#---------------------------------------------
sub Base_Name {
# return basename,
# e.g. /home/sscprick/.WWW/Welcome.html
# returns: Welcome.html
local($local_filename)=$_[0];
$local_filename =~ s#.*/##; # remove the directory name -> file name
$local_filename;
}
sub Dir_Name {
# return dirname,
# e.g. /home/sscprick/.WWW/Welcome.html
# returns: /home/sscprick/.WWW/
local($local_filename)=$_[0];
$local_filename =~ s#.*/##; # remove the directory name -> file name
local($local_dirname) = $_[0];
$local_filename =~ s/(\W)/\\$1/g; # escape regexp chars
$local_dirname =~ s/$local_filename$//; # wipe filename at end -> dir name
$local_dirname;
}
sub CheckQual {
# See if #section qual is present in file
local($fn, $qual) = @_;
$qual =~ s/(\W)/\\$1/g; # quote rexep chars
open(CH_HTML, $fn) || die "xx Could not open $fn\n";
while (<CH_HTML>) {
chop;
if (/<a[ ]+name[ ]*=[ ]*"?$qual"?/i) {
close(CH_HTML);
return 1;
}
}
close(CH_HTML);
return 0;
}
#---------------------------------------------
sub http {
# Checks one URL by sending an HTTP HEAD request; returns the HTTP
# status code, or a negative code if the connection failed.
# (NB: the split of $URL into $host, $port, and $path below is a
# plausible sketch; webxref's original parsing may have differed.)
local($URL) = $_[0];
local($host, $port, $path) =
($URL =~ m#^http://([^:/]+):?([0-9]*)(/.*)?$#i);
if (!defined($host)) { return -1; }
if ($port eq "") { $port = 80; }
if (!defined($path) || ($path eq "")) { $path = "/"; }
if ($debug) {
print "\n URL: $URL\n host: $host\n port: $port\n path: $path\n";
}
# The following is largely taken from the Camel book, chapter 6
$AF_INET = 2;
$SOCK_STREAM = 1;
$sockaddr = 'S n a4 x8';
chop($hostname = `hostname`);
($name,$aliases,$proto) = getprotobyname('tcp');
($name,$aliases,$port) = getservbyname($port,'tcp') unless $port =~ /^\d+$/;
($name,$aliases,$type,$len,$thisaddr) = gethostbyname($hostname);
if (!(($name,$aliases,$type,$len,$thataddr) = gethostbyname($host))) {
return -1;
}
$this = pack($sockaddr, $AF_INET, 0, $thisaddr);
$that = pack($sockaddr, $AF_INET, $port, $thataddr);
# Make the socket filehandle.
# ** Temporary fix, this is NOT The way to do it. 15-APR-96
if (!(socket(S, $AF_INET, $SOCK_STREAM, $proto))) {
$SOCK_STREAM = 2;
if (!(socket(S, $AF_INET, $SOCK_STREAM, $proto))) { return -2; }
}
# Give the socket an address
if (!(bind(S, $this))) {
return -3;
}
if (!(connect(S,$that))) {
return -4;
}
select(S); $| = 1; select(STDOUT);
print S "HEAD $path HTTP/1.0\n\n";
$response = <S>;
($protocol, $status) = split(/ /, $response);
while (<S>) {
#print;
}
close(S);
#print "Status: $status\n";
return $status;
}
#---------------------------------------------
sub Print_List {
# The header must be popped off first: assigning to a hash would
# swallow the entire argument list, leaving $header empty.
local($header) = pop(@_);
local(%list) = @_;
local($file);
# Don't list empty lists
if (! %list) {return};
print "\n\n----------------\n$header\n";
@TheList=keys(%list);
@SortedList = sort @TheList;
foreach $file (@SortedList) {
print "$file \n";
@lost = split(/ /,$list{$file});
@sortlost = sort @lost;
print " Referenced by:\n";
foreach $lostfile (@sortlost) {
print " $lostfile\n";
}
}
} # sub Print_List
sub Print_Lists {
# Print lists
# List all files found
if (!$Silent) { &Print_List(%FileList,"Web documents found:");}
# List of directories referenced
if (!$Silent) { &Print_List(%DirList,"Directories:");}
# List of images referenced
if (!$Silent) { &Print_List(%ImageFileList,"Images:");}
# List of mailto's
if (!$Silent) { &Print_List(%MailList,"Mailto:");}
# List of ftp's
if (!$Silent) { &Print_List(%FTPList,"ftp:");}
# List of telnets
if (!$Silent) { &Print_List(%TelnetList,"telnet:");}
# List of gophers
if (!$Silent) { &Print_List(%GopherList,"gopher:");}
# List of news
if (!$Silent) { &Print_List(%NewsList,"News:");}
# List of http's
if (!$Silent) { &Print_List(%HTTPList,"External URLs:");}
# List of file:'s
if (!$Silent) { &Print_List(%ExtFileList,"External file:");}
# List of cgi-bin scripts/forms
if (!$Silent) { &Print_List(%CGIList,"cgi-bin scripts/forms:");}
# List of name qualifs
if (!$Silent) { &Print_List(%QualifList,"Name qualifs found:");}
# List of files that can't be found
&Print_List(%LostFileList,"Files not found:");
# List of files that are not world readable
&Print_List(%UnreadableList,"Files not world readable:");
# List of directories that can't be found
&Print_List(%DirNotFoundList,"Directories not found:");
# List of name qualifs not found
&Print_List(%LostQualifList,"Name qualifs not found:");
if ($HTML_only) { print "\nDone.\n"; }
} #sub Print_Lists
# This is the last line of the webxref script really.
# If this line is missing, the file was truncated in transit.