#! /usr/bin/perl -w
# Last edited on 2008-02-04 20:14:05 by stolfi

use strict;
use warnings;
use File::Temp qw(tempfile);

# Scans the Apache server logs for a specific regexp
# and sorts the results by file, date, requesting host, or size.
# Each DATE must be in the format yyyy/mm/dd (zero-filled).
#
# For every requested date the compressed log of that day and of the two
# adjacent days is scanned (plus the live "access_log" when one of those
# days is today); lines are pre-filtered by date with "grep -F", matched
# against REGEXP, reduced to "HOST DATE TIME(ZONE) FILE STATUS SIZE"
# records, and finally handed to sort(1), which writes to standard output.

# prototypes (declared before any call so that they are actually checked)
sub main();
sub parse_log_line($);
sub push_date(\%@);
sub todays_date();
sub inc_date(@);
sub dec_date(@);
sub month_days($$);
sub arg_error($);
sub fatal_error($);
sub warning($);

my($usage) = "$0 [ -sort {date|file|host|size} ] REGEXP YYYY/MM/DD...";

# Output is truncated (with a warning) after this many bytes of records.
my($maxbytes) = 2000000;

# Index 0 is a placeholder so that month numbers 1..12 index directly.
my(@month_name) = (
    "Pop",
    "Jan", "Feb", "Mar", "Apr", "May", "Jun",
    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
);

my(%month_number) = (
    "Jan" => "01", "Feb" => "02", "Mar" => "03", "Apr" => "04",
    "May" => "05", "Jun" => "06", "Jul" => "07", "Aug" => "08",
    "Sep" => "09", "Oct" => "10", "Nov" => "11", "Dec" => "12"
);

# Fields of an output record, 1-based as sort(1) counts them:
#   1 = host, 2 = date, 3 = time(zone), 4 = file, 5 = status, 6 = size.
# These are the "-k" equivalents of the obsolete "+POS -POS" keys
# ("+3 -4 +1 -3" etc.), which current sort implementations reject.
my(%sortkeys_for) = (
    "host" => [ "-k1,1",   "-k2,3" ],
    "date" => [ "-k2,3",   "-k4,4" ],
    "file" => [ "-k4,4",   "-k2,3" ],
    "size" => [ "-k6,6nr", "-k4,4" ],
);

my(@today) = todays_date();

my($logdir) = "/home/staff/stolfi/www/logs";

# Program entry point: parses @ARGV, collects the log files for the
# requested dates, filters them into a temporary file and runs sort(1)
# on it.  Exits with the status of sort; aborts through arg_error() or
# fatal_error() on bad arguments or unusable log data.
sub main() {

    # Parse options (the default ordering is by file):
    my(@sortkeys) = @{$sortkeys_for{"file"}};
    while ((scalar(@ARGV) > 0) && ($ARGV[0] =~ /^-/)) {
        if ((scalar(@ARGV) >= 2) && ($ARGV[0] eq "-sort")) {
            my $keys = $sortkeys_for{$ARGV[1]};
            if (! defined($keys)) {
                arg_error("bad sorting option \"" . $ARGV[1] . "\"");
            }
            @sortkeys = @{$keys};
            shift(@ARGV);
            shift(@ARGV);
        } else {
            arg_error("bad option \"" . $ARGV[0] . "\"");
        }
    }

    # Get regexp, compiling it once so that a malformed pattern is
    # reported as an argument error instead of failing mid-scan:
    my($regexp);
    if (scalar(@ARGV) >= 1) {
        $regexp = shift(@ARGV);
    } else {
        arg_error("missing REGEXP");
    }
    my $pattern = eval { qr/$regexp/ };
    if (! defined($pattern)) {
        arg_error("bad REGEXP \"" . $regexp . "\"");
    }

    # Get dates:
    my(@dates) = @ARGV;
    if (scalar(@dates) == 0) {
        arg_error("missing DATES");
    }

    # Map dates to file names, and write the "grep -F" date patterns.
    # File::Temp gives unpredictable names (no symlink race) and removes
    # the files when the program exits, including on fatal errors.
    my(%fileset) = ();
    my($datefh, $datefile) =
        tempfile("weblog-dates-XXXXXX", DIR => "/tmp", UNLINK => 1);
    foreach my $dt (@dates) {
        if ($dt !~ m{^([0-9]{4})/([0-9]{2})/([0-9]{2})$}) {
            arg_error("bad date \"" . $dt . "\"");
        }
        my(@dtf) = ($1, $2, $3);
        if (($dtf[1] < 1) || ($dtf[1] > 12) ||
            ($dtf[2] < 1) || ($dtf[2] > month_days($dtf[0], $dtf[1]))) {
            arg_error("bad date \"" . $dt . "\"");
        }
        push_date(%fileset, @dtf);
        printf {$datefh} "[%02d/%s/%04d:\n",
            $dtf[2], $month_name[$dtf[1]], $dtf[0];
        # Since the log file splitting is not perfect, we must scan also
        # the log files of adjacent days:
        my(@b) = inc_date(@dtf); push_date(%fileset, @b);
        my(@c) = dec_date(@dtf); push_date(%fileset, @c);
    }
    close($datefh) or fatal_error("cannot write \"${datefile}\": $!");

    # Check for existing files:
    my(@catfiles) = ();
    my(@bz2files) = ();
    foreach my $f (sort(keys(%fileset))) {
        if (! -f "$f") {
            warning("log file \"" . $f . "\" does not exist");
        } elsif ($f =~ m/[.]bz2/) {
            push(@bz2files, $f);
        } else {
            push(@catfiles, $f);
        }
    }
    if (scalar(@bz2files) + scalar(@catfiles) == 0) {
        fatal_error("no log files found");
    }

    # Shell pipelines that produce the date-filtered log lines.  All
    # interpolated names are built from $logdir, validated digits and
    # File::Temp names, so they contain no shell metacharacters.
    my(@commands) = ();
    if (scalar(@catfiles) > 0) {
        push(@commands, "/bin/cat @{catfiles} | grep -F -f ${datefile}");
    }
    if (scalar(@bz2files) > 0) {
        push(@commands, "bzip2 -dc @{bz2files} | grep -F -f ${datefile}");
    }

    # Filter the lines and collect the output records:
    my($totbytes) = 0;
    my($tempfh, $tempfile) =
        tempfile("weblog-recs-XXXXXX", DIR => "/tmp", UNLINK => 1);
    COMMAND_LOOP:
    foreach my $cmd (@commands) {
        open(my $in, "-|", $cmd)
            or fatal_error("cannot run \"${cmd}\": $!");
        while (my $lin = <$in>) {
            $lin =~ s/%7[Ee]/~/g;
            next unless ($lin =~ $pattern);
            my $rec = parse_log_line($lin);
            if (! defined($rec)) {
                fatal_error("bad log format «${lin}»");
            }
            print {$tempfh} $rec . "\n";
            $totbytes += length($rec);
            if ($totbytes > $maxbytes) {
                warning("output too big, truncated");
                close($in);
                last COMMAND_LOOP;
            }
        }
        # The pipeline status is deliberately ignored: grep exits
        # non-zero simply because no line matched.
        close($in);
    }
    close($tempfh) or fatal_error("cannot write \"${tempfile}\": $!");

    # Sort the records to standard output (list form: no shell involved).
    my $status = system("sort", @sortkeys, $tempfile);
    if ($status == -1) {
        fatal_error("cannot run sort: $!");
    }
    exit($status == 0 ? 0 : 1);
}

# Converts one access-log line (common log format, "HOST - - [TIME ZONE]
# "REQUEST" STATUS SIZE") into the output record
# "HOST YYYY/MM/DD HH:MM:SS(ZONE) FILE STATUS SIZE".
# The leading "GET" and trailing "HTTP/n.n" are stripped from the request
# and remaining blanks become "%20", so the record has a fixed number of
# blank-separated fields.  Returns nothing if the line is not in the
# expected format; warns (and keeps the raw time) if only the time stamp
# is malformed.
sub parse_log_line($) {
    my($lin) = @_;
    if ($lin !~ m{^([^ ]+) - - (\[[^ ]* [-+][^ ]*\]) (["].*["]) ([-0-9]+) ([-0-9]+)$}) {
        return;
    }
    # Extract the interesting fields:
    my($host, $rtime, $file, $err, $size) = ($1, $2, $3, $4, $5);

    # Clean up the date and convert to sortable format:
    if (($rtime =~ m{^\[([0-9]+)/([A-Z][a-z][a-z])/([0-9]+):([^ ]+) *([^ ]+)\]$})
        && exists($month_number{$2})) {
        $rtime = ($3 . "/" . $month_number{$2} . "/" . $1 . " " . $4 . "(" . $5 . ")");
    } else {
        warning("bad date \"${rtime}\" in log file");
    }

    $file =~ s:^"GET[ ]+::;
    $file =~ s:[ ]+http[/ ]*[0-9][.][0-9]"$::i;
    $file =~ s:[ ]:%20:g;

    # Build the output record:
    return join(" ", $host, $rtime, $file, $err, $size);
}

# Returns today's local date as zero-filled strings (YYYY, MM, DD).
sub todays_date() {
    my(@t) = localtime(time());
    return (
        sprintf("%04d", $t[5] + 1900),
        sprintf("%02d", $t[4] + 1),
        sprintf("%02d", $t[3])
    );
}

# Given a date (Y, M, D), returns the following day as zero-filled
# strings (YYYY, MM, DD), rolling over month and year as needed.
sub inc_date(@) {
    my($y, $m, $d) = @_;
    $d = sprintf("%02d", $d + 1);
    if ($d > month_days($y, $m)) {
        $d = "01";
        $m = sprintf("%02d", $m + 1);
        if ($m > 12) {
            $m = "01";
            $y = sprintf("%04d", $y + 1);
        }
    }
    return ($y, $m, $d);
}

# Given a date (Y, M, D), returns the preceding day as zero-filled
# strings (YYYY, MM, DD), rolling back month and year as needed.
sub dec_date(@) {
    my($y, $m, $d) = @_;
    $d = sprintf("%02d", $d - 1);
    if ($d < 1) {
        # Step to the previous month FIRST: the new day is the last day
        # of that month, not of the month we started in.
        $m = sprintf("%02d", $m - 1);
        if ($m < 1) {
            $m = "12";
            $y = sprintf("%04d", $y - 1);
        }
        $d = sprintf("%02d", month_days($y, $m));
    }
    return ($y, $m, $d);
}

# Returns the number of days of month M (1..12) in year Y, using the
# Gregorian leap-year rule.
sub month_days($$) {
    my($y, $m) = @_;
    if ($m == 2) {
        my $leap = (($y % 4) == 0) && ((($y % 100) != 0) || (($y % 400) == 0));
        return ($leap ? 29 : 28);
    } elsif (($m == 4) || ($m == 6) || ($m == 9) || ($m == 11)) {
        return 30;
    } else {
        return 31;
    }
}

# Adds to the set %fileset (passed by reference through the prototype)
# the names of the log files that may contain entries for date (Y, M, D).
sub push_date(\%@) {
    my($rfs, $y, $m, $d) = @_;
    # Get the compressed log file:
    $$rfs{"${logdir}/${y}/${m}/access_log.${d}.bz2"} = 1;
    # Get also the current access log, if the date is today:
    if (($y == $today[0]) && ($m == $today[1]) && ($d == $today[2])) {
        $$rfs{"${logdir}/access_log"} = 1;
    }
}

# Reports a command-line error, prints the usage line, and aborts.
sub arg_error($) {
    my($msg) = @_;
    print STDERR "$msg\n";
    print STDERR "usage: $usage\n";
    die("aborted");
}

# Reports an unrecoverable error and aborts.
sub fatal_error($) {
    my($msg) = @_;
    print STDERR "$msg\n";
    die("aborted");
}

# Prints a warning on STDERR and continues.
sub warning($) {
    my($msg) = @_;
    print STDERR "warning: $msg\n";
}

# Run only when executed as a script, so that the file can also be
# loaded (e.g. by tests) without side effects.
main() unless caller;

1;