Hi,
I hope this is not considered a double post. I just wanted to have an extra post for the scripts I made.
The installation instructions written above point to these scripts.
nagiossync.py (synchronize host configuration and status information (check results, downtime, comments ...) from NagiosMaster to NagiosBackup)
#!/usr/bin/env python
#!/usr/bin/env python
import commands
import os
import time
import re
def logfilesize(logfile, max_bytes=10485760):
    """Delete *logfile* once it has reached *max_bytes* (default 10 MiB).

    This is a crude form of log rotation: the file is removed, not archived,
    and is recreated by the next function that opens it in append mode.
    A missing log file is not an error.

    The file is removed with os.remove() instead of spawning ``rm`` through a
    shell, so paths containing spaces or shell metacharacters are handled
    correctly.
    """
    try:
        logsize = os.path.getsize(logfile)
    except OSError:
        # No log file yet (or it is not accessible): nothing to rotate.
        return
    if logsize < max_bytes:
        return
    try:
        os.remove(logfile)
    except OSError as err:
        # Best effort, like the former ``rm`` call: report and carry on.
        print('could not remove %s: %s' % (logfile, err))
def SyncConfigDir(sourcepath, desthost, destpath, logfile):
    """Copy the configuration directory to the backup host with rsync.

    sourcepath -- local directory that is pushed
    desthost   -- host name or address of the backup server
    destpath   -- directory on desthost that receives sourcepath
    logfile    -- path of the log file; the rsync output is appended to it

    The log file is closed even when running rsync raises, and a non-zero
    rsync status is recorded explicitly instead of being ignored.
    """
    localtime = time.asctime(time.localtime(time.time()))
    command = "rsync -e 'ssh -p 22' -azvp %s %s:%s" % (sourcepath, desthost, destpath)
    with open(logfile, 'a') as log:
        log.write('\n### starting config sync at %s ###\n' % localtime)
        status, output = commands.getstatusoutput(command)
        log.write(output)
        if status != 0:
            log.write('\nrsync failed (status %s)\n' % status)
        log.write('\n### End of config sync ###\n')
def _strip_retention_blocks(content, patterns):
    """Return *content* with every match of each regex in *patterns* removed.

    The patterns are applied one after another and compiled with re.DOTALL so
    that ``.*?`` can span the lines of a ``name { ... }`` block.
    """
    for pattern in patterns:
        content = re.compile(pattern, re.DOTALL).sub('', content)
    return content


def SyncStatusInfo(sourcepath, destpath, desthost, retentionpath, logfile):
    """Merge the master's status information into the backup's retention.dat.

    Steps:
      1. download retention.dat from desthost to /tmp,
      2. read the local (master) retention.dat and drop the info, program and
         contact blocks as well as blocks of the hosts NagiosBackup/localhost,
      3. drop from the downloaded file all blocks whose host_name starts
         with "H" or "M",
      4. write the concatenation of both to a new file,
      5. stop Nagios on desthost, move its retention.dat aside, upload the
         merged file, fix the ownership and start Nagios again.

    sourcepath and destpath are accepted for interface compatibility and are
    not used. retentionpath is the directory holding retention.dat and is
    used for both the local and the remote file. Progress is appended to
    logfile.

    If any local step fails, the error is logged and the function returns
    before Nagios on the backup host is touched.
    """
    retentionfile = retentionpath + '/retention.dat'
    tmpfile = '/tmp/retention.dat'
    newtmpfile = '/tmp/newretention.dat'

    # blocks that must not be copied from the master
    master_patterns = (
        r'(info\s\{)(.*?)(\})',
        r'(program\s\{)(.*?)(\})',
        r'(contact\s\{)(.*?)(\})',
        r'[A-Za-z]*\s\{\shost_name=NagiosBackup(.*?)(\})',
        r'[A-Za-z]*\s\{\shost_name=localhost(.*?)(\})',
    )
    # blocks of the backup file that are replaced by the master's data
    # NOTE(review): the original comment says the monitored hosts start with
    # "h" or "m"; the patterns only match upper-case "H"/"M" - confirm.
    backup_patterns = (
        r'[A-Za-z]*\s\{\shost_name=H(.*?)(\})',
        r'[A-Za-z]*\s\{\shost_name=M(.*?)(\})',
    )

    localtime = time.asctime(time.localtime(time.time()))
    with open(logfile, 'a') as log:
        log.write('\n### starting to sync status information at %s ###\n' % localtime)
        try:
            # get retention.dat from NagiosBackup
            status, output = commands.getstatusoutput(
                "rsync -chavzP --stats root@%s:%s/retention.dat /tmp" % (desthost, retentionpath))
            if status != 0 or not os.path.exists(tmpfile):
                log.write('error receiving retention.dat from %s\n' % desthost)
                log.write(output + '\n')
                return
            log.write('received retention.dat from %s\n' % desthost)

            # open retention.dat from NagiosMaster
            try:
                with open(retentionfile, 'r') as masterfile:
                    retfilecontent = masterfile.read()
            except IOError:
                log.write('could not open %s' % retentionfile)
                return
            # remove unwanted information from retfilecontent
            retfilecontent = _strip_retention_blocks(retfilecontent, master_patterns)

            # read retention.dat file from NagiosBackup
            log.write('\n### writing status state information to NagiosBackup retention.dat file ###\n')
            try:
                with open(tmpfile, 'r') as backupfile:
                    backupretfilecontent = backupfile.read()
            except IOError:
                log.write('Could not open retention.dat file from /tmp')
                return
            # remove unwanted state information
            backupretfilecontent = _strip_retention_blocks(backupretfilecontent, backup_patterns)

            # create new retention.dat file
            try:
                with open(newtmpfile, 'w') as newfile:
                    newfile.write(backupretfilecontent)
                    newfile.write('\n# synced data from NagiosMaster\n')
                    newfile.write(retfilecontent)
            except IOError:
                log.write('could not create new retention.dat file /tmp/newretention.dat')
                return
            log.write('successfully added service state information to retention.dat file\n')

            # replace the downloaded copy with the merged file
            os.rename(newtmpfile, tmpfile)

            log.write('\n### stoping Nagios process on NagiosBackup ###\n')
            status, output = commands.getstatusoutput("ssh root@%s 'service nagios stop'" % desthost)
            log.write(output)
            # keep the previous file next to the new one as retention.dat_orig
            commands.getstatusoutput(
                "ssh root@%s 'mv %s %s_orig'" % (desthost, retentionfile, retentionfile))

            log.write('\n### trying to copy retention.dat to NagiosBackup ###\n')
            status, output = commands.getstatusoutput(
                "rsync -e 'ssh -p 22' -azvp /tmp/retention.dat %s:%s" % (desthost, retentionpath))
            log.write(output)
            if status != 0:
                # do not start Nagios without any retention data: put the
                # previous file back
                log.write('\nerror copying retention.dat to %s, restoring previous file\n' % desthost)
                commands.getstatusoutput(
                    "ssh root@%s 'mv %s_orig %s'" % (desthost, retentionfile, retentionfile))
            commands.getstatusoutput(
                "ssh root@%s 'chown nagios:nagios %s'" % (desthost, retentionfile))

            log.write('\n### trying to start nagios process in NagiosBackup again ###\n')
            status, output = commands.getstatusoutput("ssh root@%s 'service nagios start'" % desthost)
            log.write(output)
        finally:
            # never leave temporary copies behind; a stale /tmp/retention.dat
            # could be mistaken for a fresh download on the next run
            for leftover in (tmpfile, newtmpfile):
                if os.path.exists(leftover):
                    os.remove(leftover)
def Restart(logfile, desthost):
    """Restart Nagios on desthost so that it reads the synced configuration.

    logfile  -- path of the log file; the command output is appended to it
    desthost -- host name or address of the backup server

    The success message is only written when the remote command returned
    status 0; otherwise the failure is logged. The log file is closed even
    when running the command raises.
    """
    localtime = time.asctime(time.localtime(time.time()))
    with open(logfile, 'a') as log:
        log.write('\n### sending restart command to %s at %s ###\n' % (desthost, localtime))
        status, output = commands.getstatusoutput("ssh root@%s 'service nagios restart'" % desthost)
        log.write(output)
        if status == 0:
            log.write('\n### Nagios Process has been restarted ###\n')
        else:
            log.write('\n### Nagios restart failed (status %s) ###\n' % status)
def checkFailover(desthost):
    """Tell whether the failover marker file is absent on desthost.

    Runs ``cat`` on /usr/local/nagios/etc/failfile.log via ssh and returns
    True when cat reports that the file does not exist, False for any other
    output. The caller only syncs status information when this returns True,
    i.e. when no failover happened.
    """
    no_marker_message = "cat: /usr/local/nagios/etc/failfile.log: No such file or directory"
    _, remote_output = commands.getstatusoutput("ssh root@%s 'cat /usr/local/nagios/etc/failfile.log'" % desthost)
    return no_marker_message in remote_output
if __name__ == "__main__":
    # settings of the sync run
    logfile = '/var/log/nagiossync.log'
    desthost = '10.132.72.171'
    sourcepath = '/usr/local/nagios/etc/sites'
    destpath = '/usr/local/nagios/etc/'
    retentionpath = '/usr/local/nagios/var'

    # drop an oversized log before anything is appended to it
    logfilesize(logfile)

    # the configuration is always synced
    SyncConfigDir(sourcepath, desthost, destpath, logfile)

    # status information is only synced while no failover marker exists
    # on the backup host
    status_sync_allowed = checkFailover(desthost)
    if status_sync_allowed:
        SyncStatusInfo(sourcepath, destpath, desthost, retentionpath, logfile)
    # disabled: explicit restart of Nagios on the backup host
    #Restart(logfile, desthost)
failovercheck.py (check if NagiosMaster is alive and the nagios process is running. If not do a failover to NagiosBackup)
#!/usr/bin/env python
import commands
import sys
import time
import datetime
import os
def NagiosProcess(check_nrpe, nagiosmasterip):
    """Run the NRPE command check_nagios_proc against nagiosmasterip.

    check_nrpe is the path of the check_nrpe plugin. Returns True when the
    plugin output starts with 'PROCS CRITICAL', otherwise False. Note that
    True therefore signals a problem.
    """
    plugin_call = "%s -H %s -c check_nagios_proc" % (check_nrpe, nagiosmasterip)
    _, plugin_output = commands.getstatusoutput(plugin_call)
    return plugin_output.startswith('PROCS CRITICAL')
def NagiosPing(check_ping, nagiosmasterip, warning='3000.0,80%', critical='5000.0,100%'):
    """Ping nagiosmasterip with the check_ping plugin.

    check_ping -- path of the check_ping plugin
    warning    -- warning threshold in check_ping's "<rta>,<pl>%" format
    critical   -- critical threshold in the same format

    Returns True when the plugin reports a critical state, otherwise False.
    Note that True therefore signals a problem.

    The plugin used to be called with "-c check_nagios_proc", an NRPE command
    name that check_ping cannot parse as a threshold, so a critical result
    could never be reported. It is now called with proper -w/-c thresholds.
    """
    command = "%s -H %s -w %s -c %s" % (check_ping, nagiosmasterip, warning, critical)
    status, output = commands.getstatusoutput(command)
    # NOTE(review): besides "PING CRITICAL - ..." check_ping appears to emit
    # "CRITICAL - Host Unreachable (...)" / "CRITICAL - Network Unreachable
    # (...)" without the "PING" prefix - confirm against the installed plugin.
    return output.startswith(('PING CRITICAL', 'CRITICAL'))
def failover(procRun, pingReply, nagiosmasterip, nagiossync, nagioscfg, logfile, failfile):
    """Re-check a failing master after 30 seconds and fail over if it persists.

    procRun and pingReply are the results of NagiosProcess() and NagiosPing();
    a true value means the respective check was critical. logfile is an open
    file object, failfile the path of the marker file written on failover.

    * Process check critical, ping fine: repeat the process check. If it is
      still critical, enable checks and notifications in nagioscfg, run
      nagiossync on the master via ssh and write the marker file.
    * Ping critical: repeat the ping. If it is still critical, enable checks
      and notifications in nagioscfg, restart the local Nagios and write the
      marker file.

    The plugin paths check_nrpe and check_ping are not parameters; they are
    read from module-level variables.
    """
    def enable_checks_and_notifications():
        logfile.write('enabled execute_service_checks and enable_notifications in nagios.cfg\n')
        os.system("sed -i 's/execute_service_checks=0/execute_service_checks=1/' %s" % nagioscfg)
        os.system("sed -i 's/enable_notifications=0/enable_notifications=1/' %s" % nagioscfg)

    def write_failover_marker():
        marker = open(failfile, 'w')
        marker.write('failover')
        marker.close()

    if procRun and not pingReply:
        # host answers, but the Nagios process seems to be gone
        time.sleep(30)
        still_no_process = NagiosProcess(check_nrpe, nagiosmasterip)
        if still_no_process:
            logfile.write('\nNagios process on %s is not running! Performed two checks\n' % nagiosmasterip)
            enable_checks_and_notifications()
            os.system("ssh root@%s '%s'" % (nagiosmasterip, nagiossync))
            logfile.write('executed nagiossync_v1.py on %s\n' % nagiosmasterip)
            write_failover_marker()
    elif pingReply:
        # host does not answer at all; the process state is irrelevant
        time.sleep(30)
        still_no_reply = NagiosPing(check_ping, nagiosmasterip)
        if still_no_reply:
            logfile.write('\nNagiosMaster does not respond! Performed two checks\n')
            enable_checks_and_notifications()
            logfile.write('restarting Nagios process')
            _, restart_output = commands.getstatusoutput('service nagios restart')
            logfile.write(restart_output)
            write_failover_marker()
def failback(procRun, pingReply, nagioscfg, failfile, logfile):
    """Hand monitoring back to the master once both checks are fine again.

    procRun and pingReply are the results of NagiosProcess() and NagiosPing();
    a true value means the respective check was critical. When neither is
    critical, checks and notifications are disabled in nagioscfg, the local
    Nagios is restarted and the marker file failfile is removed. logfile is
    an open file object.
    """
    if procRun or pingReply:
        # master still has a problem: stay in failover mode
        return

    logfile.write('\nNagiosMaster is back online again! disabling checks and notifications now\n')
    os.system("sed -i 's/execute_service_checks=1/execute_service_checks=0/' %s" % nagioscfg)
    os.system("sed -i 's/enable_notifications=1/enable_notifications=0/' %s" % nagioscfg)

    logfile.write('restarting Nagios process')
    _, restart_output = commands.getstatusoutput('service nagios restart')
    logfile.write(restart_output)

    os.system('rm %s' % failfile)
if __name__ == "__main__":
    # plugin paths; failover() reads check_nrpe and check_ping as globals
    check_nrpe = '/usr/local/nagios/libexec/check_nrpe'
    check_ping = '/usr/local/nagios/libexec/check_ping'

    # master to watch and local files
    nagiosmasterip = '10.132.72.170'
    nagioscfg = '/usr/local/nagios/etc/nagios.cfg'
    nagiossync = '/usr/local/nagios/etc/nagiossync_v1.py'
    failfile = '/usr/local/nagios/etc/failfile.log'
    faillog = '/var/log/failover.log'

    logfile = open(faillog, 'a')

    # both helpers return True when their check is critical
    procRun = NagiosProcess(check_nrpe, nagiosmasterip)
    pingReply = NagiosPing(check_ping, nagiosmasterip)
    master_has_problem = procRun or pingReply
    failed_over = os.path.exists(failfile)

    if failed_over and not master_has_problem:
        failback(procRun, pingReply, nagioscfg, failfile, logfile)
    elif master_has_problem and not failed_over:
        failover(procRun, pingReply, nagiosmasterip, nagiossync, nagioscfg, logfile, failfile)

    logfile.close()
EDIT: both scripts are working for now. They need a bit of optimization but I will leave them for now.
EDIT: Fixed a small bug in the nagiossync script. If the Nagios process on NagiosMaster has failed, we no longer want to sync the status information from the master to the backup server.
EDIT: I created a small plugin to check security alerts sent from an IPS to the snmptrapd service on the Nagios server. As I wanted the email to contain different information from what the Nagios web interface provides, I needed to let the plugin generate the email instead of Nagios doing it the standard way. This has one drawback: the recipients have to be configured in the plugin itself, so you can't use the standard Nagios configuration file for email recipients (contacts.cfg).
#!/usr/bin/env python
import MySQLdb as mdb
import sys
import os
import socket
def getData(dbserver, dbuser, dbpass, db):
    """Fetch the warning and critical traps of host "ips" from table snmptt.

    Returns the rows as delivered by cursor.fetchall(), ordered by id.
    On a database error the message is printed and the plugin exits with
    status 2 (CRITICAL).

    Fixes compared to the former query:
      * the two severity conditions are grouped in parentheses; without them
        AND bound tighter than OR, so critical traps of every host matched,
      * ORDER BY id refers to the column; "id" in double quotes is a string
        constant for MySQL unless ANSI_QUOTES is enabled.
    """
    dbcon = None  # so that the finally clause works when connect() fails
    try:
        dbcon = mdb.connect(dbserver, dbuser, dbpass, db)
        dbcur = dbcon.cursor()
        dbcur.execute(
            'SELECT * FROM snmptt '
            'WHERE hostname LIKE "ips" '
            'AND (severity LIKE "warning" OR severity LIKE "critical") '
            'ORDER BY id')
        return dbcur.fetchall()
    except mdb.Error as e:
        print("Error %d: %s" % (e.args[0], e.args[1]))
        sys.exit(2)  # CRITICAL = 2
    finally:
        if dbcon is not None:
            dbcon.close()
def notify(dbdata, contacts, dbserver, dbuser, dbpass, db):
    """Mail every trap in dbdata to every contact, then delete the trap.

    dbdata   -- rows as returned by getData(); the columns used are
                [0] id, [6] hostname, [9] severity, [11] traptime,
                [12] traptext
    contacts -- list of recipient addresses
    dbserver, dbuser, dbpass, db -- passed on to deleteTrap()

    The mail is handed to the ``mail`` command through an argument list and
    stdin instead of a shell command line: the trap text arrives over the
    network and must never be interpreted by a shell. A mail that cannot be
    handed over is reported on stderr; the trap is deleted regardless, as
    before.
    """
    # local import: subprocess is not imported at the top of this script
    import subprocess

    for row in dbdata:
        trap_id, source, severity = row[0], row[6], row[9]
        traptime, traptext = row[11], row[12]

        # gethostbyaddr/gethostbyname raise herror or gaierror; socket.error
        # is the common base class of both
        try:
            host = socket.gethostbyaddr(source)[0]
        except socket.error:
            host = source
        try:
            address = socket.gethostbyname(source)
        except socket.error:
            address = source

        subject = '** Security Alert %s **' % host
        body = ('*****Nagios*****\n\nNotification Type: %s\n\nService: SNMP Traps\n'
                'Host: %s\nAddress: %s\nState: %s\n\nDate/Time: %s\n\n'
                'Additional Info:\n\n%s\n'
                % (severity, host, address, severity, traptime, traptext))

        for contact in contacts:
            try:
                mailer = subprocess.Popen(['mail', '-s', subject, contact],
                                          stdin=subprocess.PIPE,
                                          universal_newlines=True)
                mailer.communicate(body)
            except (OSError, IOError) as err:
                sys.stderr.write('could not send mail to %s: %s\n' % (contact, err))

        deleteTrap(trap_id, dbserver, dbuser, dbpass, db)
def deleteTrap(id, dbserver, dbuser, dbpass, db):
    """Delete the trap with the given id from table snmptt.

    Returns the result of cursor.fetchall() after the DELETE, as before.
    On a database error the message is printed and the plugin exits with
    status 2 (CRITICAL).

    Fixes compared to the former version:
      * the row is deleted from snmptt, the table getData() reads; the
        statement used to name a table "db", which looks like a leftover of
        anonymizing the script,
      * the id is passed as a query parameter instead of being formatted
        into the SQL text,
      * the transaction is committed, otherwise the DELETE is lost on
        transactional tables when the connection is closed.
    """
    dbcon = None  # so that the finally clause works when connect() fails
    try:
        dbcon = mdb.connect(dbserver, dbuser, dbpass, db)
        dbcur = dbcon.cursor()
        dbcur.execute('DELETE FROM snmptt WHERE id=%s', (id,))
        dbdata = dbcur.fetchall()
        dbcon.commit()
        return dbdata
    except mdb.Error as e:
        print("Error %d: %s" % (e.args[0], e.args[1]))
        sys.exit(2)  # CRITICAL = 2
    finally:
        if dbcon is not None:
            dbcon.close()
def nagiosStatus(dbdata):
    """Print the plugin status line and exit with the matching status.

    Exits with 1 (WARNING) when dbdata contains at least one trap and with
    0 (OK) when it is empty. The function never returns.

    The redundant ``elif`` was replaced by a plain fall-through, and print is
    called in the single-argument form that works as a statement and as a
    function.
    """
    warningCount = len(dbdata)
    if warningCount:
        print("TRAPS WARNING: %s traps in database - Mail will be send and trap deleted" % warningCount)
        sys.exit(1)  # WARNING = 1
    print("TRAPS OK: no warning traps in database")
    sys.exit(0)  # OK = 0
if __name__ == "__main__":
    # recipients of the alert mails
    contacts = ['recipient1@domain.com', 'recipient2@domain.com']

    # database connection settings
    dbserver = 'localhost'
    dbuser = 'dbuser'
    dbpass = 'dbpass'
    db = 'db'
    connection = (dbserver, dbuser, dbpass, db)

    dbdata = getData(*connection)
    if dbdata is not None:
        # mail and delete the traps first, then report the plugin status
        notify(dbdata, contacts, *connection)
        nagiosStatus(dbdata)