#! /usr/bin/perl # Last edited on 2003-10-12 15:30:31 by stolfi # webxref # ------ # Webxref is a WWW link checker and cross referencing tool, intended # to quickly check a local set of HTML documents for missing files, # anchors etc. You simply call webxref with a HTML document as the parameter. # Webxref compiles a list of HTML documents, URLs, name anchors, images etc # and the html files that reference those, i.e. a cross-reference list. # # usage: webxref [-nohttp -htmlonly -avoid regexp] file.html # # -nohttp tells webxref *not* to check http:// URLs via the network # # -htmlonly tells webxref to *only* inspect files with the .html # extension for further links. # # -silent tells webxref to only output error messages and keep quiet # about things that are ok. # # -avoid regexp: when regexp matches a filename/URL/... the item is # not further inspected. Make sure this parameter is # supplied in quotes, else the shell will interpret various # characters like *,$,.,?,... # # Examples # webxref file.html # checks file.html and files/URLs referenced from file.html # webxref -nohttp file.html # checks file.html, but not external URLs # webxref -htmlonly file.html # checks file.html, but no files without .html extension # webxref -avoid '.*Archive.*' file.html # checks file.html but avoids files with names containing # 'Archive' # webxref -avoid '.*Archive.*|.*Distribution.*' file.html # Same as above, but also files with names containing # 'Distribution' are skipped. # # Lists are made of: # - html files # - directories # - binary files (images) # - named anchors # - mailto's # - news # - ftp # - telnet # - gopher # - external URLs # - cgi-bin scripts # - file:'s # - files that can't be found # - files that are not world readable # - directories that can't be found # - name anchors that can't be found # - http:// ok references # - http:// failed references # # Written July 1995 by Rick Jansen at SARA as part of the SURFACE project # (SURFnet Advanced Communication Environment) # email: rick@sara.nl # url: http://www.sara.nl/Rick.Jansen # # 11-JUL-95 lcheck version 0.0.0 # 18-JUL-95 renamed webxref 0.0.1 # 20-JUL-95 webxref 0.0.2 # 21-JUL-95 webxref 0.0.3 root handling # 27-JUL-95 webxref 0.0.4 metachar interpretation in substitutions fixed # 28-JUL-95 webxref 0.0.5 pass associative array to sub # 08-AUG-95 webxref 0.0.6 parsing with temp file # 08-AUG-95 webxref 0.0.7 handle Welcome/welcome/index.html in case of dir # 08-AUG-95 webxref 0.0.8 'file:' refs # 10-AUG-95 webxref 0.0.9 Extensible default_files # 14-AUG-95 webxref 0.1.0 Some perl lint removed, cgi-bin added # 28-SEP-95 webxref 0.1.1 1-level external URL checking added # 04-OCT-95 webxref 0.1.2 options -nohttp -htmlonly and -avoid added # 04-OCT-95 webxref 0.1.3 Restriction on tags not being allowed to spread # over more than 1 source line removed, thanks to # Hans Hoppe (hopha@sci.kun.nl) # 10-OCT-95 webxref 0.1.4 -silent option # 15-APR-96 webxref 0.1.5 Temporary fix for $SOCK_STREAM # # # New versions can be obtained from: # http://www.sara.nl/Rick.Jansen #--------------------------------------------- # Configurable things: # Files to try in case of a directory reference like ../.. @default_files = ('Welcome.html','welcome.html','index.html', 'index.shtml','README.html'); #--------------------------------------------- $debug = 0; $temp_file = "/tmp/webxref.$$"; #--------------------------------------------- # HTTP status codes and messages %OkStatusMsgs = ( 200, "OK 200", 201, "CREATED 201", 202, "Accepted 202", 203, "Partial Information 203", 204, "No Response 204", ); %FailStatusMsgs = ( -1, "Could not lookup server", -2, "Could not open socket", -3, "Could not bind socket", -4, "Could not connect", 301, "Found, but moved", 302, "Found, but data resides under different URL (add a /)", 303, "Method", 304, "Not Modified", 400, "Bad request", 401, "Unauthorized", 402, "PaymentRequired", 403, "Forbidden", 404, "Not found", 500, "Internal Error", 501, "Not implemented", 502, "Service temporarily overloaded", 503, "Gateway timeout ", 600, "Bad request", 601, "Not implemented", 602, "Connection failed (host not found?)", 603, "Timed out", ); #--------------------------------------------- # Process parameters $Do_External_URLs = 1; # Default we do check external URLs $HTML_only = 0; # If 0, referenced files are checked for links # even if the file has no .html extension $Avoid = ""; # Regexp to avoid certain URLs, files,... $Silent = 0; # If silent=1 only error msgs will be printed $InFile = ""; while ($ARGV[0] =~ /^-/) { if ($ARGV[0] eq "-help") {&PrintHelp;} elsif ($ARGV[0] eq "-nohttp") {$Do_External_URLs = 0; } elsif ($ARGV[0] eq "-htmlonly") {$HTML_only = 1; } elsif ($ARGV[0] eq "-silent") {$Silent = 1; } elsif ($ARGV[0] eq "-avoid") { shift; $Avoid = $ARGV[0]; print "Avoided: $Avoid\n"; } else {&PrintUsage;} shift; } $InFile = $ARGV[0]; if ($InFile eq "") { print "No input file.\n"; exit; } #--------------------------------------------- # Does the file exist at all? stat($InFile); die "Cannot find file $InFile\n" unless -e $InFile; #--------------------------------------------- if ($debug) { print "=======================\n"; print "\n input file: $InFile\n"; } # Assume webxref is called in the document root directory $root = `pwd`; chop($root); if (!$Silent) { print "\nChecking $InFile\n\n"; } &Get_Refs($InFile,""); &Print_Lists; # Check external URLs if ($Do_External_URLs) { if (! $Silent) { print <<"E_O_T"; - - - - - - - - - - - - - - - - - - - - - - - - - - - Going to really check external URLs via the network. This may take some time. Simply abort webxref if you are out of patience. - - - - - - - - - - - - - - - - - - - - - - - - - - - E_O_T } &Check_External_URLs(%HTTPList, "Checking external URLs:"); print "\nAll done.\n"; } exit; sub PrintUsage { print <<"E_O_T"; Usage: webxref -help -nohttp -htmlonly -silent -avoid regexp file.html E_O_T exit; } sub PrintHelp { print <<"E_O_T"; Usage: webxref -help -nohttp -htmlonly -silent -avoid regexp file.html -nohttp: do not check external URLs -htmlonly: only inspect files with the .html extension -silent: only output error/problem messages -avoid regexp: avoid files with names matching regexp for inspection Examples webxref file.html checks file.html and files/URLs referenced from file.html webxref -nohttp file.html checks file.html, but not external URLs webxref -htmlonly file.html checks file.html, but only files with the .html extension webxref -avoid '.*Archive.*' file.html checks file.html but avoids files with names containing 'Archive' webxref -avoid '.*Archive.*|.*Distribution.*' file.html Same as above, but also files with names containing E_O_T exit; } #--------------------------------------------- sub Get_PWD { # Get the pwd, make sure it ends with a slash local($dir); $dir = `pwd`; $dir =~ s/\n//g; if (!($dir =~ m#.*/$#)) { $dir = "$dir/"; } return $dir; } sub Get_Refs { # Recursively get all referenced files a from the file local(%newlist); local($file); local($dir); local($Old_Dir); local($filename); $dir=&Dir_Name($_[0]); if ($dir eq "") { $dir = &Get_PWD; } $file=&Base_Name($_[0]); #print "--------------------\n"; if ($debug) { print "arg=$_[0]\n"; print "dir=$dir\n"; print "file=$file\n"; } # http? if ($_[0] =~ m/.*(http:.*)/i) { if (!defined($HTTPList{$1})) { $HTTPList{$1} = $_[1]; } else { $HTTPList{$1} = "$HTTPList{$1} $_[1]"; } return; } # ftp? if ($_[0] =~ m/.*(ftp:.*)/i) { if (!defined($FTPList{$1})) { $FTPList{$1} = $_[1]; } else { $FTPList{$1} = "$FTPList{$1} $_[1]"; } return; } # telnet? if ($_[0] =~ m/.*(telnet:.*)/i) { if (!defined($TelnetList{$1})) { $TelnetList{$1} = $_[1]; } else { $TelnetList{$1} = "$TelnetList{$1} $_[1]"; } return; } # gopher? if ($_[0] =~ m/.*(gopher:.*)/i) { if (!defined($GopherList{$1})) { $GopherList{$1} = $_[1]; } else { $GopherList{$1} = "$GopherList{$1} $_[1]"; } return; } # mailto? if ($_[0] =~ m/.*(mailto:.*)/i) { if (!defined($MailList{$1})) { $MailList{$1} = $_[1]; } else { $MailList{$1} = "$MailList{$1} $_[1]"; } return; } # news? if ($_[0] =~ m/.*(news:.*)/i) { if (!defined($NewsList{$1})) { $NewsList{$1} = $_[1]; } else { $NewsList{$1} = "$NewsList{$1} $_[1]"; } return; } # file:? if ($_[0] =~ m/.*(file:.*)/i) { if (!defined($ExtFileList{$1})) { $ExtFileList{$1} = $_[1]; } else { $ExtFileList{$1} = "$ExtFileList{$1} $_[1]"; } return; } # cgi-bin script? if ($_[0] =~ m#(^/cgi-bin/.*)#i) { $_[0] =~ m#(^/cgi-bin/.*)=.*#i; # Delete cgi-parameters if (!defined($CGIList{$1})) { $CGIList{$1} = $_[1]; } else { $CGIList{$1} = "$CGIList{$1} $_[1]"; } return; } # directory reference? if ($file eq "") { if ($debug) { print "$dir must be a dir, refd by $_[1]!\n"; } if (-d $_[0]) { if (!defined($DirList{$_[0]})) { $DirList{$_[0]} = $_[1]; } else { $DirList{$_[0]} = "$DirList{$_[0]} $_[1]"; } } else { if (!defined($DirNotFoundList{$_[0]})) { $DirNotFoundList{$_[0]} = $_[1]; } else { $DirNotFoundList{$_[0]} = "$DirNotFoundList{$_[0]} $_[1]"; } } return; } # Move to the specified directory $Old_Dir = &Get_PWD; if ($debug) { print "Chdir to $dir\n"; } chdir($dir); $dir=&Get_PWD; if ($debug) { print "Now in $dir\n"; } $filename = $dir . $file; if (! $Silent) { print "Checking: $filename\n"; } # Is it a reference to a specific section? (a file#section reference) if ($filename =~ m/(.+)#(.+)/) { $filename = "$1#$2"; if (&CheckAnchor($1, $2) ) { #print "** Anchor $2 is present in file $1\n"; # Add to the list of anchors if (!defined($AnchorList{$filename})) { $AnchorList{$filename} = $_[1]; } else { $AnchorList{$filename} = "$AnchorList{$filename} $_[1]"; } } else { print "xx Anchor $2 is NOT present in file $1\n"; print "xx Referenced by: $_[1]\n"; #print "Anchor filename: $filename\n"; # Add to the list of lost anchors if (!defined($LostAnchorList{$filename})) { $LostAnchorList{$filename} = $_[1]; } else { $LostAnchorList{$filename} = "$LostAnchorList{$filename} $_[1]"; } } return; } # # Add to the list of already tested files # # If the "file" is a directory try Welcome/welcome/index.html if (-d $filename) { #print "xx $filename is a directory, trying Welcome/welcome/index.html.\n"; $found = 0; foreach $default_file (@default_files) { #print "Trying $default_file\n"; if (-f ($file . '/' . $default_file)) { $dirname=$filename; $file= $default_file; $found = 1; last; } } if (! $found) { print "xx No Welcome/welcome/index.html can be found in $filename\n"; print "xx Referenced by: $_[1]\n"; # Add to list of lost files if (!defined($LostFileList{$filename})) { $LostFileList{$filename} = $_[1]; } else { $LostFileList{$filename} = "$LostFileList{$filename} $_[1]"; } return; } # Move to the specified directory if ($debug) { print "Chdir to $dirname\n"; } chdir($dirname); $dir=&Get_PWD; if ($debug) { print "Now in $dir\n"; } $filename = $dir . $file; if ($debug) { print "** Filename is now: $filename\n"; print "** Dirname is now: $dir\n"; } } if (! -f $filename) { print "xx $filename cannot be found\n"; print "xx Referenced by: $_[1]\n"; # Add to list of lost files if (!defined($LostFileList{$filename})) { $LostFileList{$filename} = $_[1]; } else { $LostFileList{$filename} = "$LostFileList{$filename} $_[1]"; } return; } # Binary file? (pictures,...) if (-B $filename) { if ($debug) { print "** Binary file added to images"; } if (defined($ImageFileList{$filename})) { return; } if (!defined($ImageFileList{$filename})) { $ImageFileList{$filename} = $_[1]; # Define! } else { $ImageFileList{$filename} = "$ImageFileList{$filename} $_[1]"; } if ($debug) { print "\n\nAdded: $filename to list of images\n"; } return; } # else it's a text (html)file if (!defined($FileList{$filename})) { $FileList{$filename} = $_[1]; # Define! } else { $FileList{$filename} = "$FileList{$filename} $_[1]"; return; # Already did this file } if ($debug) { print "** Added: $filename \n"; } # World readable? ($_,$_,$mode) = stat($filename); $readmode = ($mode & 4); if ($readmode == 0) { # Not world readable, add to list #print "xx Warning: $filename is not world readable\n"; if (!defined($UnreadableList{$filename})) { $UnreadableList{$filename} = $_[1]; } else { $UnreadableList{$filename} = "$UnreadableList{$filename} $_[1]"; } } if ($HTML_only) { # Filename *must* have extension .html, else we don't inspect it. if ($filename !~ /.*\.html$/i) {return;} } # Apply the regexp to avoid certain files if ($Avoid ne "") { if ($filename =~ m/$Avoid/) { print "** The above file is avoided.\n"; return; } } $err = 0; open(HTML, $filename) || ($err = 1); if ($err) { print "xx Could not open file $filename\n"; return; } # Make sure all $temp_file") || die "Could not create $temp_file\n"; #while () { # s/$temp_file") || die "Could not create $temp_file\n"; $offset=0; do { $size=read(HTML,$html_text,32768,$offset); $offset=$offset+$size; } until $size != 32768; close(HTML); #$html_text =~ s/\n/ /gs; $html_text =~ s/\n/ /g; $html_text =~ s/[^<]*//; $html_text =~ s/(<[^>]*>)[^<]*/\1\n/g; print TEMP "$html_text"; $html_text=""; close(TEMP); open(HTML, $temp_file) || die "Could not open $temp_file\n"; while () { chop; # .*//; # trailing stuff s/>.*$//i; # remove >'s s/"//g; # Unquote file names s/^\s*//; # Remove spaces at start s/\s*$//; # Remove spaces at end # Link to section within current document? if (m/^#.*/) { $file_w_anchor = $filename; $file_w_anchor =~ s#.*/##; if ($debug) { print "file_w_anchor: $file_w_anchor\n"; print "Added to newlist: $file_w_anchor$_\n"; } $newlist{"$file_w_anchor$_"} = 1; # Check this file plus anchor later } # Link to another document? else { if ($debug) { print "added to newlist: $_\n"; } $newlist{$_} = 1; } } # .*//; s/>.*$//i; # remove >'s s/"//g; # Unquote file names s/^\s*//; # Remove spaces at start s/\s*$//; # Remove spaces at end # Add file to the list $newlist{$_} = 1; } } close(HTML); chdir($Old_Dir); if ($debug) { # List files print "\nNewlist:\n"; foreach $file (keys(%newlist)) { print "$file \n"; } } # Walk the list foreach $file (keys(%newlist)) { # if file is //something insert a http: if ($file =~ m#^//.*#) { $file = "http:" . $file; } $Notlocal_file = $dir . $file; # If file is /something it's a reference from the root document. # It can also be a cgi-bin reference! if ($file =~ m#^/cgi-bin/.*#) { $Notlocal_file = $file; } elsif ($file =~ m#^/.*#) { $Notlocal_file = "$root$file"; } $Notlocal_ref_filename = $filename; if ($debug) { print "\nCalling GR with $Notlocal_file\n"; print "Referenced by: $Notlocal_ref_filename\n"; } &Get_Refs($Notlocal_file, $Notlocal_ref_filename); } unlink($temp_file); } #sub Get_Refs #--------------------------------------------- sub Base_Name { # return basename, # e.g. /home/sscprick/.WWW/Welcome.html # returns: Welcome.html local($local_filename)=$_[0]; $local_filename =~ s#.*/##; # remove the directory name -> file name $local_filename; } sub Dir_Name { # return dirname, # e.g. /home/sscprick/.WWW/Welcome.html # returns: /home/sscprick/.WWW/ local($local_filename)=$_[0]; $local_filename =~ s#.*/##; # remove the directory name -> file name local($local_dirname) = $_[0]; $local_filename =~ s/(\W)/\\$1/g; # escape regexp chars $local_dirname =~ s/$local_filename$//; # wipe filename at end -> dir name $local_dirname; } sub CheckAnchor { # See if #section anchor is present in file local($fn, $anchor) = @_; $anchor =~ s/(\W)/\\$1/g; # quote rexep chars open(CH_HTML, $fn) || die "xx Could not open $fn\n"; while () { chop; if (/\n URL: $URL\n host: $host\n port: $port\n path: $path\n"; } # The following is largely taken from the Camel book, chapter 6 $AF_INET = 2; $SOCK_STREAM = 1; $sockaddr = 'S n a4 x8'; chop($hostname = `hostname`); ($name,$aliases,$proto) = getprotobyname('tcp'); ($name,$aliases,$port) = getservbyname($port,'tcp') unless $port =~ /^\d+$/; ($name,$aliases,$type,$len,$thisaddr) = gethostbyname($hostname); if (!(($name,$aliases,$type,$len,$thataddr) = gethostbyname($host))) { return -1; } $this = pack($sockaddr, $AF_INET, 0, $thisaddr); $that = pack($sockaddr, $AF_INET, $port, $thataddr); # Make the socket filehandle. # ** Temporary fix, this is NOT The way to do it. 15-APR-96 if (!(socket(S, $AF_INET, $SOCK_STREAM, $proto))) { $SOCK_STREAM = 2; if (!(socket(S, $AF_INET, $SOCK_STREAM, $proto))) { return -2; } } # Give the socket an address if (!(bind(S, $this))) { return -3; } if (!(connect(S,$that))) { return -4; } select(S); $| = 1; select(STDOUT); print S "HEAD $path HTTP/1.0\n\n"; $response = ; ($protocol, $status) = split(/ /, $response); while () { #print; } close(S); #print "Status: $status\n"; return $status; } #--------------------------------------------- sub Print_List { local(%list, $header) = @_; local($file); # Don't list empty lists if (! %list) {return}; print "\n\n----------------\n$header\n"; @TheList=keys(%list); @SortedList = sort @TheList; foreach $file (@SortedList) { print "$file \n"; @lost = split(/ /,$list{$file}); @sortlost = sort @lost; print " Referenced by:\n"; foreach $lostfile (@sortlost) { print " $lostfile\n"; } } } # sub Print_List sub Print_Lists { # Print lists # List all files found if (!$Silent) { &Print_List(%FileList,"Web documents found:");} # List of directories referenced if (!$Silent) { &Print_List(%DirList,"Directories:");} # List of images referenced if (!$Silent) { &Print_List(%ImageFileList,"Images:");} # List of mailto's if (!$Silent) { &Print_List(%MailList,"Mailto:");} # List of ftp's if (!$Silent) { &Print_List(%FTPList,"ftp:");} # List of telnets if (!$Silent) { &Print_List(%TelnetList,"telnet:");} # List of gophers if (!$Silent) { &Print_List(%GopherList,"gopher:");} # List of news if (!$Silent) { &Print_List(%NewsList,"News:");} # List of http's if (!$Silent) { &Print_List(%HTTPList,"External URLs:");} # List of file:'s if (!$Silent) { &Print_List(%ExtFileList,"External file:");} # List of cgi-bin scripts/forms if (!$Silent) { &Print_List(%CGIList,"cgi-bin scripts/forms:");} # List of name anchors if (!$Silent) { &Print_List(%AnchorList,"Name anchors found:");} # List of files that can't be found &Print_List(%LostFileList,"Files not found:"); # List of files that are not world readable &Print_List(%UnreadableList,"Files not world readable:"); # List of directories that can't be found &Print_List(%DirNotFoundList,"Directories not found:"); # List of name anchors not found &Print_List(%LostAnchorList,"Name anchors not found:"); if ($HTML_only) { print "\nDone.\n"; } } #sub Print_Lists # This is the last line of the webxref script really. # If this line is missi