#!/bin/sh
#
#  roll_web_logs.  Author: Tony Sanderson, Bluehaze Solutions, 3/05/2000
#
#  NB: This 'file' is a "link" to the 'real thing' on this server.
#  ---------------------------------------------------------------
#
#  A shell to allow "webalizer" to run in direct (non incremental) mode
#  without taking an exponentially increasing amount of time to do it as
#  the server logs build up month by month.
#
#  Basically, it just shuffles the apache/realmedia logs around a bit to
#  minimise the amount of crunching that "webalizer" and "logresolve"
#  need to do.
#  ---------------------------------------------------------------------
#
#  HISTORY:
#
#  Add 30 sec sleep after restarting apache
#
#  Jan 21, 2001 - Improve the accuracy of these comments slightly :-)
#
#  Dec 2, 2000 (major change) - added extra code to move the resolved archive
#  files out of the way at the start of each month (into a further sub-dir),
#  so webalizer only has to process a maximum of 1 month of resolved data.
#  Previously, these were building up indefinitely, creating an ever-increasing
#  load.
#
#  Oct 30th 2000 - added extra code at the top to kludge in the RealMedia
#  server log (the kbyte figures seem to be rubbish but the hits may be ok)
#
#  May 2nd 2000 - modify stop/start of web server to use "reload" instead of
#  actually stopping and re-starting.  (From apache doco)
#                        ---------------------------
#
#    Synopsis:
#  A wrapper for web log analysis programs such as webalizer.  Allows one to
#  run webalizer in full ("Incremental no") mode without taking 3 weeks to do
#  the requisite resolving.
#
#   Description of operation:
#
#  (a) Resolves IP numbers in the current web logs,
#  (b) Appends these to the existing resolved logs, if any
#  (c) Processes these accumulated resolved logs with webalizer
#  (d) Renames existing log(s), reloads web server, deletes the renamed logs.
#
#   NOTE: These accumulated resolved logs (which webalizer processes) are now
#   "rolled away" into a sub-dir each month to ensure that a maximum of one
#   month's data needs processing - see history above (Dec 2, 2000).
#                    ----------------------------------
#
#  This script creates a noisy trace of everything it does, so make sure that
#  STDOUT and STDERR are directed to a suitable log if running via cron.
#
#  My cron entry is currently:
#  0 0,3,6,9,12,15,18,21 * * * /usr/local/sys/roll_web_logs > /usr/local/webalizer/Log 2>&1
#                    ----------------------------------
#
#  Directories used (as defined by constants further below) are:
#
#  LOG DATA
#
#    /usr/local/apache/logs
#  This is the default directory where apache keeps its logs.  Normally, these
#  logs are NOT resolved.
#
#    /usr/local/apache/logs/<year>/<logfile>_<month-number>
#  The accumulated (unresolved) logs from above.  Not used for anything
#  normally, just archived here in case we ever do need them.
#
#    /usr/local/apache/logs/webalizer/<name>
#  The resolved result(s) of the current apache log(s).  <name> is the first
#  part of the site IP name.  For a simple server, this will typically just
#  be "www".  If you have multiple access log files (due to, eg: the use of
#  "virtual hosts") there will normally be one file for each virtual host.
#  File(s) normally quite small if this script is run at least once a day.
#
#    /usr/local/apache/logs/webalizer/archives
#  The monthly-accumulated resolved log files from above.  These are the actual
#  files which webalizer processes.
#
#    /usr/local/apache/logs/webalizer/archives/done
#  The accumulated resolved log files from above.  The above monthly files are
#  concatenated onto these indefinitely.  Emergency backup only - sometimes
#  useful when things go "bang" when making major changes to this script :-)
#  May be deleted or pruned if and as desired.
#
#  CONFIG DATA (WEBALIZER)
#
#    /usr/local/webalizer/<name>
#  Directory(s) for webalizer config and other info for each site name.
#  Again - <name> here will be the first part of the site IP name.  If you're
#  running multiple host names (eg: apache's "virtual hosts"), then you will
#  almost certainly need multiple matching <name> directories, each with its
#  own private "webalizer.conf" file.
#
#  Note that these same directories normally also take all the OUTPUT data
#  (HTML and GIF/PNG files) which comprise the pages that you'll access for
#  seeing your stats.  Mind you ... you don't HAVE to do it that way.
#  ---------------------------------------------------------------------------

#  Define a few constants:

#  --- Where the executables and init scripts live ---
BIN="/usr/local/bin"            # Dir holding the webalizer executable (default).
#BIN=/usr/bin                   # Alternative: RedHat executable
ABIN="/usr/local/apache/bin"    # Dir holding apache's "logresolve"
ID="/etc/rc.d/init.d"           # Dir holding the start/stop scripts (apache, realmedia, etc)
PS_CMD="ps aufxww"              # Fine on Linux - Solaris needs "ps -ef"

#  --- Directory and file name components (see intro above) ---
WORKTREE="/usr/local/webalizer" # Webalizer configs and scratch
LOGS="/usr/local/apache/logs"   # Web logs - must exist
WD="webalizer"                  # Sub-dir of $LOGS for freshly resolved logs
WDA="archives"                  # Sub-dir of $WD for the month's accumulated logs
DONE="done"                     # Sub-dir of $WDA for previous months' logs
MFILE="month"                   # File recording the month of the last run
CONFIG="webalizer.conf"         # Per-host webalizer config file name
DOMAIN="bluehaze.com.au"        # Put your domain here.
LOGSUF=".access"                # Any file in $LOGS with this suffix will be attacked.

#  --- Full paths built from the pieces above ---
ARCHIVES="${LOGS}/${WD}/${WDA}"
MF="${ARCHIVES}/${MFILE}"
DD="${ARCHIVES}/${DONE}"

#  --- RealMedia log kludge settings ---
MLOGDIR="/usr/local/bin/realserver/Logs"   # MMedia log dir
MLOG="rmaccess.log"                        # MMedia log
MELOG="rmerror.log"                        # MMedia error log
MHOST="mm"                                 # Pseudo host name for the MMedia stats
MLOGOUT="${LOGS}/${MHOST}.access"          # Converted MMedia log lands here

#  --- Today's date parts: YEAR names the raw-log archive dir, MONTH
#      drives the monthly roll-over ---
YEAR=$(date '+%Y')
MONTH=$(date '+%m')

# =========================== Code starts here ==============================

date
echo "----------------------------------"

#  An opening kludge -
#
#  Translate the RealMedia access log into something vaguely resembling
#  'Common Log Format' and drop the result into the apache logs directory
#  (as $MLOGOUT) so that it gets resolved and fed to webalizer along with
#  the proper web logs.
#
#  The whole step is skipped if $MLOGDIR cannot be entered; otherwise the
#  "mv" and "rm" below would operate on whatever directory the script
#  happened to be started from.

if cd "$MLOGDIR"
then
	echo "Checking for $MLOG in $MLOGDIR first:"
	if [ -s "$MLOG" ]
	then
		echo "$MLOG found - processing into ${MLOGOUT}:"
		#  The sed expressions, in order:
		#   1. Cut everything from the first " [" that follows a digit to
		#      end-of-line (the RM end crud).  The digit is captured and put
		#      back with \1 - without that, the last digit of the final
		#      numeric field was being deleted along with the crud.
		#   2. Reduce every run of spaces to a single space.
		#   3. Strip the "ramgen/" prefix from requested paths.
		#   4. Drop requests for the admin pages altogether.
		#   5. Prefix what is left of each request path with /audio/ra/.
		sed -e 's/\([0-9]\) \[.*$/\1/' \
		    -e 's/  */ /g' \
		    -e 's/GET ramgen\//GET /' \
		    -e '/GET admin/d' \
		    -e 's/GET /GET \/audio\/ra\//' "$MLOG" > "$MLOGOUT"
		cat "$MLOG" >> "${MLOG}.bak"	# Keep copy of original just in case ...
		mv "$MLOG" junk
		#  Do error log too (it may not exist, hence the silenced errors):
		cat "$MELOG" >> "${MELOG}.bak" 2>/dev/null
		mv "$MELOG" junkerr 2>/dev/null
		#  Restart RM server, then remove the renamed logs:
		"${ID}/realmedia" restart
		rm -f junk junkerr
	else
		echo "No $MLOG data found in $MLOGDIR"
	fi
else
	echo "Cannot cd to $MLOGDIR - skipping the RealMedia log"
fi

#   --------------- End of the RealMedia log kludge ----------------------

#  The REAL work starts now: resolve all logs using "logresolve":

#  Everything from here on globs and renames files relative to $LOGS, so
#  give up straight away if we cannot get there.  Nothing has been appended
#  to the archives yet at this point, so stopping here is safe.
cd "$LOGS" || { echo "FATAL: cannot cd to $LOGS - giving up" >&2; exit 1; }

#  Create subdirs and (initialised) month file if/as necessary.
#  printf is used for the banner because "echo" does not expand \n in
#  every shell.  mkdir errors are left visible so that they end up in the
#  trace log.
printf '\nNow in %s\n\n' "$(pwd)"
for d in "$WD" "$ARCHIVES" "$DD"
do
	if [ ! -d "$d" ]
	then
		mkdir "$d"
	fi
done

#  First run ever: record the current month so the roll-over test later
#  on has something to compare against.
if [ ! -f "$MF" ]
then
	echo "$MONTH" > "$MF"
fi

LIST=
#  Month recorded by the previous run; compared with $MONTH further down
#  to detect a change of month.
LAST_MONTH=$(cat "$MF")

#  Working in the apache logs dir for this loop:

#  For every non-empty "<name>$LOGSUF" file in the current directory:
#    1. resolve its IP numbers with logresolve into $WD/<name>, and
#    2. append that result to the accumulated log $ARCHIVES/<name>.
#  <name> is the part of the file name before the first dot.
#  Each command is echoed before it is run, to keep the noisy trace.
for f in *"$LOGSUF"
do
	#  If nothing matches, the shell hands us the pattern itself - skip it:
	[ -f "$f" ] || continue

	OF=${f%%.*}
	if [ -s "$f" ]
	then
		#  Resolve this log if non-empty:
		echo "$ABIN/logresolve  < $f > $WD/${OF}"
		"$ABIN/logresolve" < "$f" > "$WD/$OF" ||
			echo "WARNING: logresolve reported an error for $f"

		#  Now append this result onto the existing (accumulated) one
		#  in $ARCHIVES:
		echo "cat $WD/${OF} >> $ARCHIVES/${OF}"
		cat "$WD/$OF" >> "$ARCHIVES/$OF"
	else
		echo "$f was EMPTY - skipping it."
	fi
	#  LIST collects the names seen; nothing else in this script reads it.
	LIST="$LIST $OF"
done

#  Build HOSTS: the name of every regular file in $LOGS/$WD, ie: one entry
#  per resolved log.  Sub-directories are skipped.  If the directory cannot
#  be entered, HOSTS stays empty rather than being filled with the names of
#  whatever files live in the current directory.
HOSTS=
if cd "$LOGS/$WD"
then
	for f in *
	do
		if [ -f "$f" ]
		then
			HOSTS="$HOSTS $f"
		fi
	done
else
	echo "WARNING: cannot cd to $LOGS/$WD - no hosts to process"
fi

echo "----------------------------------"
date
echo "----------------------------------"
#   Now cd to the webalizer output data area and process each log.
#   webalizer is handed paths relative to $WORKTREE ($HOST/$CONFIG for the
#   config, $HOST for the output dir), so it is only run if the cd worked.
#   A failure here is not fatal: the log rotation further down must still
#   happen, because the current logs have already been appended to the
#   archives.

if cd "$WORKTREE"
then
	printf '\nNow in %s and variable HOSTS now =%s\n\n' "$(pwd)" "$HOSTS"
	#  $HOSTS is deliberately unquoted: it is a space-separated list.
	for HOST in $HOSTS
	do
		#  Site name = the part of the file name before the first "_":
		NAME=${HOST%%_*}
		echo "$BIN/webalizer -N 0 -d -p -c $HOST/$CONFIG -n $NAME.$DOMAIN -o $HOST $ARCHIVES/$HOST"
		"$BIN/webalizer" -N 0 -d -p -c "$HOST/$CONFIG" -n "$NAME.$DOMAIN" -o "$HOST" "$ARCHIVES/$HOST"
	done
else
	echo "WARNING: cannot cd to $WORKTREE - skipping webalizer"
fi

#  The roll-over and rotation code below works on relative file names, so
#  refuse to carry on from the wrong directory.
cd "$LOGS" || { echo "FATAL: cannot cd to $LOGS - giving up" >&2; exit 1; }
printf '\nNow in %s\n\n' "$(pwd)"

#  ----------------------------------------------------------------------
#  This next "if-block" is only executed when the month rolls over.  It
#  appends each $ARCHIVES file onto its counterpart in the safe sub-dir
#  ($DD) and then empties it, clearing the way for the next month's data.
#  This means webalizer only ever has to process a maximum of one month's
#  data.
#
#  We can't include this code in the earlier for-loop (which has the same
#  parameters as this one) because this code must be run AFTER webalizer
#  has processed the files.
#
#  The month test is a string comparison, so an empty or garbled month
#  file simply counts as "different" and gets rewritten.  A numeric test
#  would fail with an error on such a file and the block would then never
#  run again.
#  ----------------------------------------------------------------------

if [ "$LAST_MONTH" != "$MONTH" ]
then
	# Update month:
	echo "$MONTH" > "$MF"

	for f in *"$LOGSUF"
	do
		#  If nothing matches, the shell hands us the pattern itself - skip it:
		[ -f "$f" ] || continue

		OF=${f%%.*}
		echo "---------------------------------------"
		echo "NEW MONTH - cleaning up $ARCHIVES/${OF}"
		echo "---------------------------------------"
		# Copy the existing month's worth of resolved archived files
		# (in $ARCHIVES) down into $DD in case we ever need them again:
		echo "cat ${ARCHIVES}/${OF} >> ${DD}/${OF}"
		if cat "${ARCHIVES}/${OF}" >> "${DD}/${OF}"
		then
			# Clear files in the $ARCHIVES dir.  Could just "rm" them, but this
			# way there's a file date to look at in case we're ever curious:
			echo "cat /dev/null > ${ARCHIVES}/${OF}"
			cat /dev/null > "${ARCHIVES}/${OF}"
		else
			# Never empty an archive that was not safely copied first:
			echo "Backup of ${ARCHIVES}/${OF} FAILED - leaving it untouched"
		fi
	done
fi

#  (c) Finally, rotate the apache logs by:
#     (1) appending the apache logs to our archived logs,
#     (2) briefly renaming the apache logs,
#     (3) restarting apache (via "www restart" - see below), and
#     (4) deleting the renamed apache logs.
#  This is as per the apache doco (ie: apache responds to a kill -SIGHUP)

#  Raw (unresolved) logs are accumulated in ./$YEAR/<name>_<month-number>.
#  Create the year directory once, up front:
if [ ! -d "$YEAR" ]
then
	mkdir "$YEAR"
fi

for f in *"$LOGSUF"
do
	#  Skips empty logs, and the literal pattern when nothing matched:
	[ -s "$f" ] || continue

	OF=${f%%.*}
	#  Append next apache log, then rename it by adding .junk.
	#  The rename only happens if the append worked: the .junk files get
	#  deleted further down, so renaming after a failed append (disk full,
	#  missing directory, ...) would throw the log data away.
	echo "cat $f >> $YEAR/${OF}_${MONTH} && mv $f ${f}.junk"
	if cat "$f" >> "$YEAR/${OF}_${MONTH}"
	then
		mv "$f" "${f}.junk"
	else
		echo "Archiving $f FAILED - leaving it in place (not rotated)"
	fi
done

#  Finally, restart apache:
#  Run the web server's init script with "restart", echoing the command
#  first for the trace log:
echo "$ID/www restart"
"$ID/www" restart

sleep 30	# Give apache time to clean up and kill children

#  Delete the renamed (*.junk) logs in the current directory.  The ./
#  prefix and the quotes keep odd file names from being split up or taken
#  for options; rm -f stays quiet if there is nothing to delete.
for f in ./*.junk
do
	rm -f "$f"
done

echo "-----------------------------------------------"
date
echo "------------------  DONE ----------------------"

##  End of "roll_web_logs".