#!/usr/bin/perl
# This script is provided as-is, with no guarantees.  Feedback is appreciated,
# to sandy@clark.net.  More details on it (and where it is used) are
# available at http://www.rpg.net/help/rpgnet_logs.html
#
# This script takes a large log file, extracts a given subset from it and
# makes an intermediate log, then analyzes this log to produce two output
# files (that are fairly self-explanatory).
# The subset is whatever keyword you want to examine, typically a
# subdirectory or subsite that you are looking for.  The -all option lets
# you analyze the entire log.
#
# It uses an assessment of people based on the number of unique page
# accesses, time-sorted.
# This assumption uses a close time connection to register as a unique
# person, i.e. a consecutive series of hits.  It's only valid for small visit
# rates (i.e. <5000 visitors per day)
#
# Arguments (in order of usefulness):
#  -mysite   = site to ignore when checking accesses (if you want to
#              ignore hits from the page designer, for example)
#  -syslog   = name of the system log you are trying to analyze
#              The default is "./www.logs", and it will prompt for a new
#              name if it cannot find this.
#  -basepage = string to search for/subsection to analyze, i.e if you
#              are www.rpg.net and want to see hits in the "magic"
#              section, set "-basepage magic".  The script will prompt
#              for this unless "-all" was specified.
#  -all      = produce results for entire log file, not just one part of it
#  -homesite = site the web pages are on, basically a second "mysite"
#              variable to help screen self-hits.  The idea is, many
#              designers work on a different site than where the pages are.
#              So if I design rpg.net pages and test it from clark.net, I
#              can set "-mysite clark.net" and "-homesite rpg.net",
#              and then the analysis results will not include my own
#              visits for testing and installation.
#  -logfile  = name to use for intermediate log file, defaults to the
#              basepage + .html_log
#  -ignore   = don't read the big log file, just work with an already-made
#              intermediate log file
#  -debug    = produce copious on-screen debugging/status information
#  (-webloc ) = used in earlier versions, now redundant (and thus ignored)
#
# new as of 11Mar97:  better handling of small statistics
# new as of 04Feb97:  screens out cgibin and cgi-bin accesses

# NOTE(review): the main program and the subroutines below communicate
# through package globals (e.g. @listpage, $ipage, @npage), so 'use strict'
# is intentionally not enabled here.

# my hard-coded defaults for this script
$syslog = "./www.logs";            # system-dependent location of html log
$mysite = "ignore_this_for_now";   # site to ignore when checking accesses
                                   # (so you don't record your own hits)

# this next batch actually gets reset later, interactively
$basepage = 0;                     # what "." actually refers to
$webloc   = "all";                 # location of web pages
$logfile  = "./html_log";          # name of file to store results
$homesite = "bestware.net";        # site web pages are on (as opposed to the
                                   # base page or accessing site)

# nothing needs to be altered after this line, even for different users
$ignore        = 0;   # lets you not read the system log
                      # (for debugging, or to reiterate)
$all           = 0;   # lets you do it for the whole site
$debug         = 0;   # set by -debug; initialised so tests never see undef
$logfile_given = 0;   # set by getargs when -logfile was supplied

getargs();

unless ($ignore) {    # make sure access log file exists
    until (-e $syslog) {
        print "Error, cannot find logfile $syslog, please give a new name.\n";
        print "System log file = ? ";
        $syslog = <STDIN>;
        # stop instead of prompting forever once standard input is exhausted
        die "\nNo system log file name given, giving up.\n"
            unless defined $syslog;
        chomp($syslog);
    }
}

if (!$all && $basepage eq "0") {
    print "Please enter basepage/web location (i.e. ehp, larp, etc): ";
    $basepage = <STDIN>;
    die "\nNo basepage given, giving up.\n" unless defined $basepage;
    chomp($basepage);
}
elsif ($all) {
    $basepage = "HOMEPAGE";
}

$webloc = $basepage;    # almost always the case
# keep a name given with -logfile; otherwise derive it from the basepage
$logfile = "./" . $basepage . ".html_log" unless $logfile_given;
$fileone = $basepage . ".log1";
$filetwo = $basepage . ".log2";
open(FILEONE, '>', $fileone) or die "Cannot write $fileone: $!\n";
open(FILETWO, '>', $filetwo) or die "Cannot write $filetwo: $!\n";

# NOTE(review): $syslog, $webloc, $mysite and $logfile are interpolated
# unquoted into a shell pipeline; only run this with trusted arguments.
if (!$all) {
    # $hcall = "grep '$webloc' $syslog | grep -v 'gif' | grep -v 'jpg' | grep -v 'cgi-bin' | grep -v 'cgibin' | grep -v '$mysite' | cut -f1,2,4,7 -d' ' >$logfile";
    $hcall = "cat $syslog | cut -f1,2,4,7 -d' ' | grep '$webloc' | grep -v 'gif' | grep -v 'jpg' | grep -v 'cgi-bin' | grep -v 'cgibin' | grep -v '$mysite' >$logfile";
}
else {
    # $hcall = "grep -v 'gif' $syslog | grep -v 'jpg' | grep -v 'cgi-bin' | grep -v 'cgibin' | grep -v '$mysite' | cut -f1,2,4,7 -d' ' >$logfile";
    $hcall = "cat $syslog | cut -f1,2,4,7 -d' ' | grep -v 'gif' | grep -v 'jpg' | grep -v 'cgi-bin' | grep -v 'cgibin' | grep -v '$mysite' >$logfile";
}

if ($debug) {    # echo the command first
    print "$hcall\n";
}

if (!$ignore) {    # note option to just use html log in my directory
    print "Checking html accesses-- please wait, this takes some time.\n";
    $syscall = `echo \"$hcall\" | /bin/csh -s`;   # my prefered way of running 'grep'
}

print "Log of $webloc hits in $logfile, making summary.\n";

# open my subset of the access log file
open(HFILE, '<', $logfile) or die "Cannot read $logfile: $!\n";

# (terms:  access = hit,
#          session = group of hits by same site at same approx time)

# initialize all the various counters and variables
# (note that we reset these to zero below, also)
$icount = 0;
while ($line = <HFILE>) {    # reads the entire log file
    parseme($line);          # sets $date, $site, $page

    # a line with no site or no date cannot be attributed to anything
    next unless defined $site && length $date;

    # next line now filters out ip addresses that accidentally match
    # the desired page; the names are matched literally, not as patterns
    next if $site =~ /\Q$homesite\E/ || $site =~ /\Q$basepage\E/;

    $listdate[$icount] = $date;
    $listsite[$icount] = $site;
    $listpage[$icount] = $page;
    ++$icount;               # records number of hits (total)
}
close(HFILE);

print "Sorting and printing results.\n";

# make array of unique pages, returning array arrpage, zeroed counters npage
# and tpage, with total number of pages ipage
pagemake();

# make array of unique sites, returning array arrsite, zeroed counters nsite
# and tsite, with total number of sites isite
sitemake();

# make monthly summary report
print "There are $ipage unique pages, accessed by $isite unique sites.\n";
print FILETWO "There are $ipage unique pages, accessed by $isite unique sites.\n";

pagesort();    # prints out to filetwo
close(FILETWO);
print "Log of per-page results is now in $filetwo\n";

print "Making daily stat summary\n";

# do daily stats-- returns # of days analyzed (idate), and arrays dates[idate],
# totalhits[idate] and datects[idate][ipage] with the counts per page per day,
# and uniqhits[idate] (i.e. eliminates consecutive hits from 1 site to determine
# number of people accessing), and deepsite[idate] (most number of pages hit
# by a single site).  All this is ready to be dumped to a plotting file
# also, popular[idate] (most popular page) and pophits[idate] (times it was hit)
dailystat();

$hits   = 0;
$people = 0;
# $hits provides a check on $icount, just to be safe-- should be the same
for ($i = 0; $i < $idate; ++$i) {
    $hits   = $hits + $totalhits[$i];
    $people = $people + $uniqhits[$i];
}

print FILEONE "$listdate[0] to $listdate[$icount-1]: $icount accesses ($isite sites, $people people) on $ipage pages.\n";
print FILEONE " Date Total People Greatest Most Page Hit Most\n";
print FILEONE " access depth visits/pg\n";
for ($i = 0; $i < $idate; ++$i) {
    printf FILEONE ("%12s %3d %3d %3d %3d %s\n",
        $dates[$i], $totalhits[$i], $uniqhits[$i], $deepsite[$i],
        $pophits[$i], $popular[$i]);
}
close(FILEONE);

print "Log of daily hits is now in $fileone\n";

# end of script

# parseme: split one line of the intermediate log into its parts.
#   Reads the global $line (and strips '+' signs from it in place).
#   Sets the globals $date (the first 11 characters after the '['),
#   $site (first whitespace-separated field) and $page (the field after the
#   date, with a leading '/' and a leading "$basepage/" removed).  An empty
#   page, or "$basepage/$basepage.html", is reported as $basepage itself.
sub parseme {
    $line =~ s/\+//g;    # gets rid of the annoying problem with + signs

    ($junk, $protodate) = split('\[', $line);    # get new date
    $protodate = "" unless defined $protodate;   # line had no '[' at all
    $date = substr($protodate, 0, 11);           # (just the DD/MMM/YYYY part)

    ($site, $junk) = split(' ', $line);          # determine the site that hit
    ($junk, $page) = split(' ', $protodate);     # everything after the date
    $page = "" unless defined $page;

    # cut a leading / only; a slash further along is part of the page name
    $page =~ s{^/}{};

    my $prefix = $basepage . "/";                # base pagename with a trailing /
    if ($page eq $prefix . $basepage . ".html") {
        $page = $basepage;                       # X/X.html is the same page as X
    }
    elsif (substr($page, 0, length($prefix)) eq $prefix) {
        $page = substr($page, length($prefix));  # remove initial X/
    }

    # indicates my home page as "." or "/" (this also covers a bare "X/")
    $page = $basepage if length($page) == 0;
}

# getargs: read the command line arguments from @ARGV into the option
#   globals ($ignore, $debug, $all, $logfile, $webloc, $syslog, $mysite,
#   $basepage, $homesite).  -help / -h prints the usage line and exits.
#   Options that take a value consume the following argument and die if it
#   is missing.  Unrecognised arguments are ignored.  $logfile_given is set
#   when -logfile was supplied so the caller can keep that name.
sub getargs {
    my %flag_for = (
        "-i" => \$ignore, "-ignore" => \$ignore,   # skips the system call
                                                   # that makes the log
        "-d" => \$debug,  "-debug"  => \$debug,
        "-a" => \$all,    "-all"    => \$all,
    );
    my %value_for = (
        "-l" => \$logfile,  "-logfile"  => \$logfile,
        "-w" => \$webloc,   "-webloc"   => \$webloc,
        "-s" => \$syslog,   "-syslog"   => \$syslog,
        "-m" => \$mysite,   "-mysite"   => \$mysite,
        "-b" => \$basepage, "-basepage" => \$basepage,
        # no short form: -h has always meant -help
        "-homesite" => \$homesite,
    );

    my @pending = @ARGV;    # work on a copy, leave @ARGV untouched
    while (@pending) {
        my $option = shift @pending;
        if ($option eq "-help" or $option eq "-h") {
            print "checkhtml [-ignore] -webloc ? -syslog ? -logfile ? -mysite ? -basepage ? -homesite ? -all?\n";
            exit 0;
        }
        elsif (exists $flag_for{$option}) {
            ${ $flag_for{$option} } = 1;
        }
        elsif (exists $value_for{$option}) {
            die "checkhtml: option $option needs a value\n" unless @pending;
            ${ $value_for{$option} } = shift @pending;
            $logfile_given = 1
                if $option eq "-l" or $option eq "-logfile";
        }
    }    # closes my 'get command line arguments' loop
}

# _tally: sort the given names and count how often each one occurs.
#   Returns two array references: the distinct names in sorted order and
#   the matching occurrence counts.  Names are compared with 'eq', so one
#   name being a substring of another never merges the two.
sub _tally {
    my (@names, @counts);
    foreach my $name (sort @_) {    # non-destructively sorts
        if (@names && $name eq $names[-1]) {
            ++$counts[-1];
        }
        else {
            push @names,  $name;
            push @counts, 1;
        }
    }
    return (\@names, \@counts);
}

# pagemake: build the table of unique pages from @listpage.
#   Sets @arrpage (page names, sorted), @npage (total hits per page),
#   @tpage (daily hits per page, all zero) and $ipage (number of pages).
sub pagemake {
    my ($names, $counts) = _tally(@listpage);
    @arrpage = @$names;
    @npage   = @$counts;            # total number of hits
    @tpage   = (0) x @arrpage;      # daily number of hits
    $ipage   = scalar @arrpage;
}

# sitemake: build the table of unique sites from @listsite.
#   Sets @arrsite (site names, sorted), @nsite (total hits per site),
#   @tsite (daily hits per site, all zero) and $isite (number of sites).
sub sitemake {
    my ($names, $counts) = _tally(@listsite);
    @arrsite = @$names;
    @nsite   = @$counts;            # total number of hits
    @tsite   = (0) x @arrsite;      # daily number of hits (empty for now)
    $isite   = scalar @arrsite;
}

# _dailyflush: close out the day currently being accumulated by dailystat.
#   Records $deeper into @deepsite if it is the largest so far, stores the
#   date, sums @tpage into @totalhits, notes the most-hit page in @popular /
#   @pophits, zeroes @tpage for the next day and advances $idate.
sub _dailyflush {
    if ($deepsite[$idate] < $deeper) {
        $deepsite[$idate] = $deeper;    # record most number of pages hit
    }
    $dates[$idate] = $prevdate;

    my ($most_hits, $most_index) = (0, 0);
    for (my $p = 0; $p < $ipage; ++$p) {
        $totalhits[$idate] = $totalhits[$idate] + $tpage[$p];
        if ($tpage[$p] > $most_hits) {
            $most_hits  = $tpage[$p];
            $most_index = $p;
        }
        $tpage[$p] = 0;
    }
    $popular[$idate] = $arrpage[$most_index];
    $pophits[$idate] = $most_hits;
    ++$idate;
}

# dailystat: walk the hits in log order and build the per-day statistics.
#   Reads @listdate, @listsite, @listpage, $icount and the page table from
#   pagemake.  Sets $idate (number of days) and, per day, @dates,
#   @totalhits, @uniqhits, @deepsite, @popular and @pophits.
#   Consecutive hits from the same site count as one person.
sub dailystat {
    $idate    = 0;
    $prevdate = $listdate[0];
    $prevsite = 0;
    $deeper   = 0;

    # page name -> index into @arrpage / @tpage
    my %index_of_page;
    @index_of_page{@arrpage} = (0 .. $#arrpage);

    for ($i = 0; $i < $icount; ++$i) {
        $date = $listdate[$i];
        $site = $listsite[$i];
        if ($debug) {    # echo each line's date
            print "$i $date\n";    # nice little debug here
        }

        if ($date ne $prevdate) {    # day boundary: store the finished day
            _dailyflush();
            $prevdate = $date;
            # same starting value as the very first day, so a site that
            # carries on across the boundary starts again at depth 1
            $deeper = 0;
        }

        if ($prevsite eq $site) {    # is not a unique access
            $addme  = 0;
            $deeper = $deeper + 1;
        }
        else {
            $addme    = 1;
            $prevsite = $site;
            if ($deepsite[$idate] < $deeper) {
                $deepsite[$idate] = $deeper;    # record most number of pages hit
            }
            $deeper = 1;
        }
        $uniqhits[$idate] = $uniqhits[$idate] + $addme;

        my $page_index = $index_of_page{ $listpage[$i] };
        ++$tpage[$page_index] if defined $page_index;
    }

    # store the final day; every hit, including the last one, is in it
    _dailyflush() if $icount > 0;
}

# NOTE(review): only the opening of pagesort is visible in this span; it is
# kept token-for-token and continues on the following line of the file.
sub pagesort {    # make line for each entry
    sub bynumber { $a <=> $b; }
    $prevhit = (-1);
    $j = 0;
    print FILETWO "Pages hit during $listdate[0] to $listdate[$icount-1]\n";
    foreach $me (sort bynumber (@npage)) {    # in order of frequency hit
        if ($debug eq 1) {    # echo each line's date
            print "$me\n";    # lame
little debug here } unless ($me eq $prevhit) { # only do one print per frequency $i = length($me); # how many digits? for ($q=$i;$q<4;++$q) { # force to be 4 spaces $line[$j] = " " . $line[$j]; } $line[$j] = $line[$j] . $me . " times: "; for ($i=0;$i<$ipage;++$i) { if ($npage[$i] eq $me) { $line[$j] .= $arrpage[$i] . " "; } } $prevhit = $me; $line[$j] .= "\n"; ++$j; } } if ($debug eq 1) { # echo each line's date print "debug checkpoint-- now reformatting.\n"; } # now reformat to nice and neat 80 columns (in a kludgy way) foreach $element (@line) { if ($debug eq 1) { # echo each line's date $mystr=" " . $element . " " . length($element) . "\n"; print $mystr; } if (length($element) > 80) { $iflag = 0; $j = 0; for ($i=0;$i