#!/bin/sh

# Name of this watchdog.  The log file, the PID file and the subsystem
# lock file are all named after it.
PROGNAME=agentupper

# Log file written by nagent; the health checks further down read it.
AGENTLOG=/var/log/n-central/nagent.log

# Debug tracing is on while DEBUG is non-empty.  Comment the assignment
# out to silence the trace lines.
DEBUG=1

# From here on everything sent to stdout or stderr is appended to our
# own log file.
LOGFILE="/var/log/n-central/${PROGNAME}.log"
exec >> "$LOGFILE" 2>&1

if [ -n "$DEBUG" ] ; then
	echo [$(date)][$$] Now starting $PROGNAME
fi

# First, deal with any previous instance of ourselves.  A leftover PID
# file means an earlier run is either still going or ended without
# cleaning up.  A live predecessor is sent TERM, given 10 seconds, and
# then sent KILL if it is still there.
[ $DEBUG ] && echo [$(date)][$$] "Now checking for a competing $PROGNAME"
# -e is the POSIX existence test; unary -a is a bash/ksh extension and
# this script declares #!/bin/sh.
if [ -e "/var/run/$PROGNAME.pid" ] ; then
	[ $DEBUG ] && echo [$(date)][$$] "$PROGNAME PID File Found"
	echo $0 Error: PID file found.
	PID=$(cat "/var/run/$PROGNAME.pid")
	# Only a purely numeric PID may be used.  With an empty value the
	# directory tests below would look at "/proc/" itself and kill
	# would be run without a target.
	case "$PID" in
	'' | *[!0-9]*)
		echo [$(date)][$$] Error: PID file does not contain a valid PID.
		PID=
		;;
	esac
	# NOTE(review): nothing confirms that this PID still belongs to
	# $PROGNAME.  If the file is stale and the PID was reused, an
	# unrelated process gets killed; checkme() guards against that by
	# matching /proc/<pid>/cmdline.
	if [ -n "$PID" ] && [ -d "/proc/$PID" ] ; then
		[ $DEBUG ] && echo [$(date)][$$] "$PROGNAME is still running"
		echo [$(date)][$$] Error: Competing process found.
		kill "$PID"
	fi
	sleep 10
	if [ -n "$PID" ] && [ -d "/proc/$PID" ] ; then
		[ $DEBUG ] && echo [$(date)][$$] "Killed, but still running."
		echo [$(date)][$$] Error: Competing process refuses to die.
		kill -9 "$PID"
	fi
	# -f: the other instance may have removed the file itself while we
	# were sleeping.
	rm -f "/var/run/$PROGNAME.pid"
fi

# Leave some breadcrumbs so we can restart ourself if necessary
[ $DEBUG ] && echo [$(date)][$$] "Writing my PID file"
echo $$ > "/var/run/$PROGNAME.pid"

# Pause before reading the file back, presumably so that an instance
# racing with us has had time to overwrite it.
sleep 1

# Check to ensure that we're the process that claimed dibs on the PID file.
# Exits 0 when another PID is in the file, 1 when the file is missing.
# (-e is the POSIX existence test; unary -a is a bash/ksh extension.)
if [ -e "/var/run/$PROGNAME.pid" ] ; then
	[ $DEBUG ] && echo [$(date)][$$] "PID file now exists."
	PID=$(cat "/var/run/$PROGNAME.pid")
	# Quoted on both sides: an empty or multi-word value has to count
	# as "not ours".  Unquoted, the test itself failed on such input and
	# the failure was taken to mean that the file was ours.
	if [ "$PID" != "$$" ] ; then
		echo [$(date)][$$] Error: Someone beat us to writing the PID file.
		exit 0
	fi
	[ $DEBUG ] && echo [$(date)][$$] "And it was me that made it."
else
	echo [$(date)][$$] Error: $PROGNAME could not write its PID file.
	exit 1
fi

# The subsystem lock file decides whether $PROGNAME should do any work.
# Without it we drop our PID file and leave quietly with status 0.
if [ -f "/var/lock/subsys/$PROGNAME" ] ; then
	if [ -n "$DEBUG" ] ; then
		echo [$(date)][$$] "Subsystem lock found, starting to process."
	fi
else
	if [ -n "$DEBUG" ] ; then
		echo [$(date)][$$] "I'm not supposed to be running.  Bye!"
	fi
	rm -f "/var/run/$PROGNAME.pid"
	exit 0
fi

# Past this point we are the only $PROGNAME and we are meant to run.

# Source the init-script function library (kept for the status
# functions the original author wanted available).
. /etc/rc.d/init.d/functions

# checkme - check one third-party daemon and restart it when unhealthy.
#
# Takes no arguments.  The caller sets these globals beforehand:
#   SERVICE     name used in the log messages
#   PIDFILE     path of the daemon's PID file
#   GREPLINE    pattern that must match the daemon's /proc/<pid>/cmdline
#   INITSCRIPT  command run as "$INITSCRIPT stop" then "$INITSCRIPT start"
# PROGNAME and DEBUG are read for logging.  The global PID is
# overwritten and AFOUL is always 0 again when the function returns.
#
# A restart is triggered when the PID file is unreadable, holds no
# numeric PID, names a PID without a /proc entry, or names a process
# whose command line does not match GREPLINE.
checkme() {
	AFOUL=0

	# PIDFILE is quoted so that a path containing whitespace (or an empty
	# value) is tested as one argument instead of breaking the test.
	if [ ! -r "$PIDFILE" ] ; then
		echo [$(date)][$$] $PROGNAME: No $SERVICE PID file found.
		AFOUL=1
	else
		[ $DEBUG ] && echo [$(date)][$$] "PID file found."
		# Take the first word of the first line.  PID is cleared first so
		# that a failed read cannot leave an older value behind.
		PID=
		read -r PID _ < "$PIDFILE"
		case "$PID" in
		'' | *[!0-9]*)
			# An empty PID would make the directory test below look
			# at "/proc/" itself, so it is rejected here explicitly.
			echo [$(date)][$$] $PROGNAME: $SERVICE PID file holds no valid PID.
			AFOUL=1
			;;
		*)
			if [ ! -d "/proc/$PID" ] ; then
				echo [$(date)][$$] $PROGNAME: $SERVICE crashed.
				AFOUL=1
			else
				[ $DEBUG ] && echo [$(date)][$$] "PID in the file exists"
				# -e stops a pattern that begins with "-" from being
				# parsed as a grep option.
				if grep -e "$GREPLINE" "/proc/$PID/cmdline" > /dev/null 2>&1 ; then
					[ $DEBUG ] && echo [$(date)][$$] "Process name correct."
				else
					echo [$(date)][$$] $PROGNAME: $SERVICE crashed and PID taken
					AFOUL=1
				fi
			fi
			;;
		esac
	fi
	if [ "$AFOUL" -ne 0 ] ; then
		# Left unquoted on purpose so INITSCRIPT may carry arguments.
		$INITSCRIPT stop
		$INITSCRIPT start
	fi

	AFOUL=0
}

# Trace AFOUL before the first check (no top-level code above assigns
# it), then announce the SSH check.
if [ -n "$DEBUG" ] ; then
	echo [$(date)][$$] AFOUL $AFOUL
	echo [$(date)][$$] "Now checking SSH"
fi

# Third-party daemon checks.  checkme takes its inputs from these
# globals rather than from arguments.
GREPLINE="sshd"
INITSCRIPT="/etc/rc.d/init.d/sshd"
PIDFILE="/var/run/sshd.pid"
SERVICE="SSH"
checkme

if [ -n "$DEBUG" ] ; then
	echo [$(date)][$$] AFOUL $AFOUL
fi

# We found nothing wrong yet
AFOUL=0

# There's something afoul if the nagent PID file doesn't exist, holds no
# usable PID, or names a process that is no longer there.
[ $DEBUG ] && echo [$(date)][$$] "Now checking nagent"
if [ ! -r /var/run/nagent.pid ] ; then
	echo [$(date)][$$] $PROGNAME: No nagent PID file found.
	AFOUL=1
else
	# Take the first word of the first line.  PID is cleared first so
	# that a failed read cannot leave an older value behind.
	PID=
	read -r PID _ < /var/run/nagent.pid
	case "$PID" in
	'' | *[!0-9]*)
		# An empty PID would make the directory test look at "/proc/"
		# itself and a dead agent would pass as running.
		echo [$(date)][$$] $PROGNAME: nagent PID file holds no valid PID.
		AFOUL=1
		;;
	*)
		# NOTE(review): unlike checkme(), this does not confirm that
		# the PID still belongs to nagent.
		if [ ! -d "/proc/$PID" ] ; then
			echo [$(date)][$$] $PROGNAME: nagent crashed.
			AFOUL=1
		fi
		;;
	esac
fi
[ $DEBUG ] && echo [$(date)][$$] AFOUL $AFOUL

# There's something afoul if the agent thinks it's hit its
# max threads, but the number of nagents running is less than
# half the maximum.  As written this check only logs: the AFOUL
# assignment below is commented out.
[ $DEBUG ] && echo [$(date)][$$] "Now checking if we leaked threads"

# Find the number of nagents running.  The bare count is written to the
# log, as it always has been.
COUNT=0
for PID in $(pidof nagent) ; do
	COUNT=$((COUNT + 1))
done
echo $COUNT

# Find half the number of nagents that's our max.  Only the first
# matching line is used and whitespace around the value is dropped, so
# "thread_limitation = 50" and repeated entries both parse.
MAXTHREADS=$(grep -i thread_limitation /etc/nagent.conf | head -n 1 | sed 's/.*=//' | tr -d '[:space:]')
case "$MAXTHREADS" in
'' | *[!0-9]*)
	# Without a numeric limit the arithmetic and the comparison below
	# would fail with syntax errors.  A limit of 0 can never exceed
	# COUNT, so the report is simply not made.
	echo [$(date)][$$] "Could not read a numeric thread_limitation from /etc/nagent.conf"
	MAXTHREADS=0
	;;
*)
	# expr rather than $(( )) so a value with a leading zero is still
	# read as decimal.
	MAXTHREADS=$(expr "$MAXTHREADS" / 2)
	;;
esac

# Does the agent think there's a problem?
if tail -n 1 "$AGENTLOG" | grep "Meet max thread" > /dev/null 2>&1 ; then
	echo [$(date)][$$] "Agent is claiming to have hit max threads"
	if [ "$COUNT" -lt "$MAXTHREADS" ] ; then
		#AFOUL=1
		# grep -i here as well, so the line shown is the one parsed above.
		echo [$(date)][$$] "We have $COUNT nagents running with a max of" $(grep -i thread_limitation /etc/nagent.conf) "in nagent.conf"
	fi
fi

[ $DEBUG ] && echo [$(date)][$$] "AFOUL $AFOUL"


# Stuck-agent check.  When the tail of the nagent log shows the max
# error count message, a watchdog file is left behind.  If a later run
# still sees the message and the watchdog is newer than the log, nagent
# has written nothing in between and is flagged for a restart.
if [ -n "$DEBUG" ] ; then
	echo [$(date)][$$] "Now checking if we're stuck"
fi
WATCHDOG=/var/lock/subsys/nagent.maxerrorcount.watchdog

# Look for the max errorcount message in the last 10 lines of the log.
tail -10 "$AGENTLOG" | grep "Max soap error count reached" > /dev/null 2>&1
RC=$?

if [ "$RC" -ne 0 ] ; then
	# Message not present: any watchdog from an earlier run is obsolete.
	echo [$(date)][$$] "Agent log has no max errorcount, deleting watchdog."
	if [ -r "$WATCHDOG" ] ; then
		rm -f "$WATCHDOG"
	fi
else
	echo [$(date)][$$] "Nagent hit max errorcount.  Checking for watchdog."
	if [ -r "$WATCHDOG" ] ; then
		echo [$(date)][$$] "Watchdog exists from last $PROGNAME run."
		echo [$(date)][$$] "Checking to see if nagent.log has been written to since."
		if [ "$WATCHDOG" -nt "$AGENTLOG" ] ; then
			echo [$(date)][$$] "Agent log has not been updated since last run of $PROGNAME"
			echo [$(date)][$$] "I take exception to that."
			AFOUL=1
		else
			echo [$(date)][$$] "Agent log has been written to since last run of $PROGNAME"
			echo [$(date)][$$] "Resetting watchdog"
			touch "$WATCHDOG"
		fi
	else
		echo [$(date)][$$] "No watchdog exists yet.  Setting it."
		touch "$WATCHDOG"
	fi
fi

if [ -n "$DEBUG" ] ; then
	echo [$(date)][$$] AFOUL=$AFOUL
fi

# There's something afoul if /var/log/messages is giving us
# ethernet transmit errors.  In the last 25 entries in messages,
# You'd see a "Probably a duplex mismatch" message.
#[ $DEBUG ] && echo [`date`][$$] "Checking for duplex mismatch in log"
#	tail -25 /var/log/messages | grep "Probably a duplex mismatch" > /dev/null 2>&1
#	RC=$?
#	if [ $RC -eq 0 ] ; then
#		echo [`date`][$$] "Kernel giving ethernet duplex mismatch errors."
#		AFOUL=1
#		COUNT=0	# We blank out /var/log/messages so we don't hit this twice by mistake
#		while [ $COUNT -lt 25 ] ; do
#			logger DEADBEEF
#			logger D34DB33F
#			COUNT=`expr $COUNT + 1`
#		done
#	fi
#	[ $DEBUG ] && echo [`date`][$$] AFOUL $AFOUL


# Did we find anything broken?  If so, restart nagent: ask the init
# script to stop it, force-kill whatever is left, pause, start it again.
# AFOUL defaults to 0 so an unset flag reads as "nothing wrong" instead
# of breaking the numeric test.
if [ "${AFOUL:-0}" -ne 0 ] ; then
	echo [$(date)][$$] $PROGNAME: I have detected a failure requiring a restart.
	# Each command below runs in the foreground and this script starts
	# no background jobs, so no "wait" is needed between the steps.
	/etc/rc.d/init.d/nagent stop
	killall -9 nagent
	sleep 30
	/etc/rc.d/init.d/nagent start
	echo [$(date)][$$] "$PROGNAME: All done, I think.  See you in 5 minutes."
fi

# Clean up after ourselves.
rm -f "/var/run/$PROGNAME.pid"

# An if statement rather than "[ $DEBUG ] && echo ...": as the last
# command of the script, the short-circuit form made the exit status
# non-zero whenever debugging was switched off.
if [ -n "$DEBUG" ] ; then
	echo [$(date)][$$] Now finished $PROGNAME
fi
