#!/bin/sh
#
# roll_web_logs.  Author: Tony Sanderson, Bluehaze Solutions, 3/05/2000
#
# NB: This 'file' is a "link" to the 'real thing' on this server.
# ---------------------------------------------------------------
#
# A shell to allow "webalizer" to run in direct (non incremental) mode
# without taking an exponentially increasing amount of time to do it as
# the server logs build up month by month.
#
# Basically, it just shuffles the apache/realmedia logs around a bit to
# minimise the amount of crunching that "webalizer" and "logresolve"
# need to do.
# ---------------------------------------------------------------------
#
# HISTORY:
#
# Add 30 sec sleep after restarting apache
#
# Jan 21, 2001 - Improve the accuracy of these comments slightly :-)
#
# Dec 2, 2000 (major change) - added extra code to move the resolved archive
# files out of the way at the start of each month (into a further sub-dir),
# so webalizer only has to process a maximum of 1 month of resolved data.
# Previously, these were building up indefinitely, creating an ever-increasing
# load.
#
# Oct 30th 2000 - added extra code at the top to kludge in the RealMedia
# server log (the kbyte figures seem to be rubbish but the hits may be ok)
#
# May 2nd 2000 - modify stop/start of web server to use "reload" instead of
# actually stopping and re-starting. (From apache doco)
# ---------------------------
#
# Synopsis:
# A wrapper for web log analysis programs such as webalizer. Allows one to
# run webalizer in full ("Incremental no") mode without taking 3 weeks to do
# the requisite resolving.
#
# Description of operation:
#
# (a) Resolves IP numbers in the current web logs,
# (b) Appends these to the existing resolved logs, if any
# (c) Processes these accumulated resolved logs with webalizer
# (d) Renames existing log(s), reloads web server, deletes the renamed logs.
#
# NOTE: These accumulated resolved logs (which webalizer processes) are now
# "rolled away" into a sub-dir each month to ensure that a maximum of one
# month's data needs processing - see history above (Dec 2, 2000).
# ----------------------------------
#
# This script creates a noisy trace of everything it does, so make sure that
# STDOUT and STDERR are directed to a suitable log if running via cron.
#
# My cron entry is currently:
# 0 0,3,6,9,12,15,18,21 * * * /usr/local/sys/roll_web_logs > /usr/local/webalizer/Log 2>&1
# ----------------------------------
#
# Directories used (as defined by constants further below) are:
#
# LOG DATA
#
# /usr/local/apache/logs
# This is the default directory where apache keeps its logs. Normally, these
# logs are NOT resolved.
#
# /usr/local/apache/logs/{year}/{host}_{month}
# The accumulated (unresolved) logs from above. Not used for anything
# normally, just archived here in case we ever do need them.
#
# /usr/local/apache/logs/webalizer/{host}
# The resolved result(s) of the current apache log(s). {host} is the first
# part of the site IP name. For a simple server, this will typically just
# be "www". If you have multiple access log files (due to, eg: the use of
# "virtual hosts") there will normally be one file for each virtual host.
# File(s) normally quite small if this script is run at least once a day.
#
# /usr/local/apache/logs/webalizer/archives
# The monthly-accumulated resolved log files from above. These are the actual
# files which webalizer processes.
#
# /usr/local/apache/logs/webalizer/archives/done
# The accumulated resolved log files from above. The above monthly files are
# concatenated onto these indefinitely. Emergency backup only - sometimes
# useful when things go "bang" when making major changes to this script :-)
# May be deleted or pruned if and as desired.
#
# CONFIG DATA (WEBALIZER)
#
# /usr/local/webalizer/{host}
# Directory(s) for webalizer config and other info for each site name.
# Again - the sub-directory name here will be the first part of the site IP
# name. If you're running multiple host names (eg: apache's "virtual hosts"),
# then you will almost certainly need multiple matching directories, each
# with its own private "webalizer.conf" file.
#
# Note that these same directories normally also take all the OUTPUT data
# (HTML and GIF/PNG files) which comprise the pages that you'll access for
# seeing your stats. Mind you ... you don't HAVE to do it that way.
# ---------------------------------------------------------------------------

# Define a few constants:

BIN=/usr/local/bin              # This is where my webalizer executable lives (default).
#BIN=/usr/bin                   # RedHat executable
ABIN=/usr/local/apache/bin      # Dir where my apache "logresolve" lives
WORKTREE=/usr/local/webalizer   # Webalizer configs and scratch (see intro above).
LOGS=/usr/local/apache/logs     # Web logs - must exist
WD=webalizer                    # Sub-dir of $LOGS for freshly resolved logs
WDA=archives                    # Sub-dir of $WD for the current month's resolved logs
DONE=done                       # Sub-dir of $WDA for previous months' resolved logs
CONFIG=webalizer.conf           # Per-host webalizer config file name
DOMAIN=bluehaze.com.au          # Put your domain here.
ARCHIVES=$LOGS/$WD/$WDA
MFILE=month                     # File in $ARCHIVES remembering the month last processed
MF="${ARCHIVES}/$MFILE"
DD="${ARCHIVES}/$DONE"
LOGSUF=".access"                # Any files in dir $LOGS with this suffix will be attacked.
ID="/etc/rc.d/init.d"           # Dir containing the scripts for starting and stopping apache, realmedia, etc
PS_CMD="ps aufxww"              # This is okay for Linux - Solaris needs "ps -ef"

# Next 3 are for kludging the RealMedia logs in:
MLOGDIR=/usr/local/bin/realserver/Logs  # MMedia log dir
MLOG=rmaccess.log                       # MMedia log
MELOG=rmerror.log                       # MMedia error log

MHOST=mm
MLOGOUT=$LOGS/${MHOST}.access

# Take the year and the month from ONE reading of the clock, so the two can
# never disagree if the script happens to start right on a year boundary.
NOW=$(date '+%Y %m')
YEAR=${NOW% *}
MONTH=${NOW#* }

# =========================== Code starts here ==============================

date
echo "----------------------------------"

# An opening kludge -
#
# This first bit just translates a RealMedia log into something vaguely
# resembling 'Common Log Format' and drops the result into the apache
# logs directory so it will be processed by webalizer along with any
# proper web logs.
# I always used to wonder if anyone was playing my RA
# files ... now I can find out :-)

# Refuse to run with an empty path constant: every step below creates,
# appends to or truncates files named from these.
: "${LOGS:?}" "${WD:?}" "${ARCHIVES:?}" "${DD:?}" "${MF:?}" "${LOGSUF:?}" "${WORKTREE:?}"

echo "Checking for $MLOG in $MLOGDIR first:"

# Only touch the RealMedia files if we really are in their directory and the
# access log has something in it.
if cd "$MLOGDIR" && [ -s "$MLOG" ]
then
  echo "$MLOG found - processing into ${MLOGOUT}:"

  # Rip off the RM end crud and reduce all space fields to one space:
  #  1. drop everything from " [" (after a digit) to end of line, but KEEP
  #     that digit - the pattern captures it, and the replacement used to
  #     throw it away.
  #     NOTE(review): presumably that digit ends the byte-count field -
  #     confirm against a real rmaccess.log.
  #  2. squeeze runs of spaces.  The pattern must demand at least one space;
  #     "zero or more" also matches the empty string and would put a space
  #     between every pair of characters.
  #  3-5. strip "ramgen/", drop admin requests, prefix paths with /audio/ra/.
  sed -e 's/\([0-9]\) \[.*$/\1/' \
      -e 's/[ ][ ]*/ /g' \
      -e 's/GET ramgen\//GET /' \
      -e '/GET admin/d' \
      -e 's/GET /GET \/audio\/ra\//' \
      "$MLOG" > "$MLOGOUT"

  cat "$MLOG" >> "${MLOG}.bak"    # Keep copy of original just in case ...
  mv "$MLOG" junk

  # Do error log too (it may legitimately not exist, hence the silence):
  cat "$MELOG" >> "${MELOG}.bak" 2>/dev/null
  mv "$MELOG" junkerr 2>/dev/null

  # Restart RM server:
  "${ID}/realmedia" restart
  rm -f junk junkerr
else
  echo "No $MLOG data found in $MLOGDIR"
fi

# --------------- End of the RealMedia log kludge ----------------------

# The REAL work starts now: resolve all logs using "logresolve":

# Everything below uses paths relative to $LOGS, so give up now (before any
# file has been modified) if we cannot get there.
cd "$LOGS" || { echo "Cannot cd to $LOGS - giving up" >&2; exit 1; }

# printf rather than echo "\n...": whether echo expands \n depends on the shell.
printf '\nNow in %s\n\n' "$(pwd)"

# Create subdirs and (initialised) month file if/as necessary:
for d in "$WD" "$ARCHIVES" "$DD"
do
  if [ ! -d "$d" ]
  then
    mkdir "$d" || { echo "Cannot create $d - giving up" >&2; exit 1; }
  fi
done

# A missing OR empty month file is (re)initialised to the current month.
if [ ! -s "$MF" ]
then
  echo "$MONTH" > "$MF"
fi

LIST=
LAST_MONTH=$(cat "$MF")

# Working in the apache logs dir for this loop:
for f in *"$LOGSUF"
do
  # With no matching files the glob stays literal - nothing to do then.
  [ -e "$f" ] || continue

  OF=${f%%.*}       # host part = file name up to the first '.'

  if [ -s "$f" ]
  then
    # Resolve this log if non-empty:
    echo "$ABIN/logresolve < $f > $WD/${OF}"
    "$ABIN/logresolve" < "$f" > "$WD/${OF}"

    # Now append this result onto the existing (accumulated) one
    # in $ARCHIVES:
    echo "cat $WD/${OF} >> $ARCHIVES/${OF}"
    cat "$WD/${OF}" >> "$ARCHIVES/${OF}"
  else
    echo "$f was EMPTY - skipping it."
  fi
  LIST="$LIST $OF"
done

# Every regular file in $WD names a host that has resolved data:
HOSTS=
for f in "$WD"/*
do
  if [ -f "$f" ]
  then
    HOSTS="$HOSTS ${f##*/}"
  fi
done

echo "----------------------------------"
date
echo "----------------------------------"

# Now cd to the webalizer output data area and process each log.
# If that directory is unreachable we only skip the webalizer run: the logs
# were already appended to $ARCHIVES above, so the rotation further down must
# still happen or the same data would be appended again on the next run.
if cd "$WORKTREE"
then
  printf '\nNow in %s and variable HOSTS now =%s\n\n' "$(pwd)" "$HOSTS"

  for HOST in $HOSTS
  do
    NAME=${HOST%%_*}      # site name = host up to the first '_'
    echo "$BIN/webalizer -N 0 -d -p -c $HOST/$CONFIG -n $NAME.$DOMAIN -o $HOST $ARCHIVES/$HOST"
    "$BIN/webalizer" -N 0 -d -p -c "$HOST/$CONFIG" -n "$NAME.$DOMAIN" -o "$HOST" "$ARCHIVES/$HOST"
  done
else
  echo "Cannot cd to $WORKTREE - skipping the webalizer run" >&2
fi

cd "$LOGS" || { echo "Cannot cd back to $LOGS - giving up" >&2; exit 1; }
printf '\nNow in %s\n\n' "$(pwd)"

# ----------------------------------------------------------------------
# This next "if-block" is only executed when the month rolls over. It
# moves the $ARCHIVES files into a safe sub-dir ($DD) to clear the way
# for the next month's data. This means webalizer only ever has to
# process a maximum of one month's data.
#
# We can't include this code in the earlier resolve loop because this
# code must be run AFTER webalizer has processed the files.
#
# The loop walks $HOSTS (the hosts webalizer just processed) rather than
# the live apache logs, so an archive is still rolled away when its log
# file happens to be absent on the run that crosses the month boundary.
# ----------------------------------------------------------------------

# String comparison: a numeric test would fail on an empty value.
if [ "$LAST_MONTH" != "$MONTH" ]
then
  # Update month:
  echo "$MONTH" > "$MF"

  for OF in $HOSTS
  do
    [ -f "${ARCHIVES}/${OF}" ] || continue

    echo "---------------------------------------"
    echo "NEW MONTH - cleaning up $ARCHIVES/${OF}"
    echo "---------------------------------------"

    # Move the existing month's worth of resolved archived files
    # (in $ARCHIVES) down into $DD in case we ever need them again:
    echo "cat ${ARCHIVES}/${OF} >> ${DD}/${OF}"
    if cat "${ARCHIVES}/${OF}" >> "${DD}/${OF}"
    then
      # Clear files in the $ARCHIVES dir. Could just "rm" them, but this
      # way there's a file date to look at in case we're ever curious:
      echo "cat /dev/null > ${ARCHIVES}/${OF}"
      cat /dev/null > "${ARCHIVES}/${OF}"
    else
      # Never empty an archive whose backup copy did not get written.
      echo "WARNING: could not save ${ARCHIVES}/${OF} - not clearing it" >&2
    fi
  done
fi

# (c) Finally, rotate the apache logs by:
# (1) appending the apache logs to our archived logs,
# (2) briefly renaming the apache logs,
# (3) sending apache a kill -SIGHUP (via "www reload" - see below), and
# (4) deleting the renamed apache logs.
# This is as per the apache doco (ie: apache responds to a kill -SIGHUP)
#
# Runs in the apache logs directory: appends every non-empty log to the
# per-year raw archive, renames it out of apache's way, restarts apache so
# it opens fresh log files, then deletes the renamed copies.
#
# NOTE(review): the surrounding comments speak of a "reload", but the command
# issued below is "www restart". Left unchanged - confirm which is intended.

# Refuse to run with an empty constant: an empty $LOGSUF would make the glob
# below match every file in the directory.
: "${LOGSUF:?}" "${YEAR:?}" "${MONTH:?}" "${ID:?}"

# The per-year archive directory only needs creating once, not per log file.
# A failure here shows up again below, when the append to it fails.
if [ ! -d "$YEAR" ]
then
  mkdir "$YEAR"
fi

for f in *"$LOGSUF"
do
  # Skips empty logs and the literal pattern left by an unmatched glob.
  [ -s "$f" ] || continue

  OF=${f%%.*}       # host part = file name up to the first '.'

  # Append next apache log, then rename it by adding .junk:
  echo "cat $f >> $YEAR/${OF}_${MONTH} && mv $f ${f}.junk"
  if cat "$f" >> "$YEAR/${OF}_${MONTH}"
  then
    mv "$f" "${f}.junk"
  else
    # The archive copy failed. Still move the log aside so apache starts a
    # fresh one, but under a name the clean-up below will not delete.
    echo "WARNING: could not archive $f - keeping it as ${f}.unarchived.$$" >&2
    mv "$f" "${f}.unarchived.$$"
  fi
done

# Finally, restart apache:
echo "$ID/www restart"
"$ID/www" restart

sleep 30          # Give apache time to clean up and kill children

# Only now is it safe to drop the renamed logs (rm -f is silent if none exist).
rm -f ./*.junk

echo "-----------------------------------------------"
date
echo "------------------ DONE ----------------------"

## End of "roll_web_logs".