Checking RabbitMQ queue size/age with Nagios
For months we were using RabbitMQ (with celery) with no real insight into what was going on inside the queue. Recently, we deployed the management plug-in, which has a nifty web UI:
From there, it seemed like a logical thing to add a Nagios check for. Here is some Python code to do just that.
#!/usr/bin/python
"""Nagios check for the depth and staleness of a RabbitMQ queue.

The queue is looked up through the RabbitMQ management HTTP API. Two things
are checked, in this order:

1. the number of messages sitting in the queue, and
2. how long ago the last message was delivered to a consumer.

The process exit status follows the Nagios plugin convention:
0 = OK, 1 = WARNING, 2 = CRITICAL, 3 = UNKNOWN.
"""
from optparse import OptionParser
import json
import pprint
import sys
import time
import datetime

try:  # Python 2
    import urllib2
    from urllib import quote
except ImportError:  # Python 3 keeps the same names in urllib.request / urllib.parse
    import urllib.request as urllib2
    from urllib.parse import quote

# Nagios plugin exit statuses.
STATE_OK = 0
STATE_WARNING = 1
STATE_CRITICAL = 2
STATE_UNKNOWN = 3


def getOptions(args=None):
    """Parse the command-line options of the check.

    Args:
        args: Optional list of argument strings. When None (the default),
            optparse falls back to ``sys.argv[1:]``.

    Returns:
        The optparse values object holding host, port, vhost, queue,
        username, password and the four thresholds.
    """
    arguments = OptionParser()
    arguments.add_option("--host", dest="host", help="Host rabbitmq is running on", type="string", default="localhost")
    arguments.add_option("--queue", dest="queue", help="Name of the queue in inspect", type="string", default="celery")
    # The vhost used to be hard-coded in the URL; "reach" stays the default
    # so existing invocations keep hitting the same endpoint.
    arguments.add_option("--vhost", dest="vhost", help="RabbitMQ virtual host the queue lives in", type="string", default="reach")
    arguments.add_option("--username", dest="username", help="RabbitMQ API username", type="string", default="rabbitmq")
    arguments.add_option("--password", dest="password", help="RabbitMQ API password", type="string", default="rabbitmq")
    arguments.add_option("--port", dest="port", help="RabbitMQ API port", type="string", default="55672")
    arguments.add_option("--warning-queue-size", dest="warn_queue", help="Size of the queue to alert as warning", type="int", default=10000)
    arguments.add_option("--critical-queue-size", dest="crit_queue", help="Size of the queue to alert as critical", type="int", default=20000)
    arguments.add_option("--warning-seconds", dest="warn_seconds", help="Last event processes in seconds ago to alert as warning", type="int", default=3600)
    arguments.add_option("--critical-seconds", dest="crit_seconds", help="Last event processes in seconds ago to alert as critical", type="int", default=14400)
    return arguments.parse_args(args)[0]


def build_url(options):
    """Return the management-API URL describing ``options.queue``.

    The vhost and queue name are percent-encoded (including "/") so that a
    vhost such as "/" becomes "%2F" instead of breaking the path.
    """
    return "http://%s:%s/api/queues/%s/%s" % (
        options.host,
        options.port,
        quote(options.vhost, safe=""),
        quote(options.queue, safe=""),
    )


def fetch_queue(url, username, password):
    """Fetch ``url`` with HTTP basic auth and return the decoded JSON body.

    Raises:
        urllib2.HTTPError: the server answered with an error status.
        urllib2.URLError: the server could not be reached.
        ValueError: the body is not valid JSON.
    """
    password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
    password_mgr.add_password(None, url, username, password)
    opener = urllib2.build_opener(urllib2.HTTPBasicAuthHandler(password_mgr))

    response = opener.open(url)
    try:
        body = response.read()
    finally:
        # Close the connection even when read() fails.
        response.close()
    return json.loads(body.decode("utf-8"))


def evaluate_queue(data, options, now=None):
    """Turn the queue description into a Nagios status and message.

    Args:
        data: Dict decoded from the management API response. Reads
            ``messages`` and ``message_stats.deliver_details.last_event``
            (milliseconds since the epoch).
        options: Object with ``queue``, ``warn_queue``, ``crit_queue``,
            ``warn_seconds`` and ``crit_seconds`` attributes.
        now: Current time in whole seconds since the epoch; defaults to
            ``int(time.time())``. Injectable for testing.

    Returns:
        A ``(status, message)`` tuple where status is one of the STATE_*
        constants.
    """
    if now is None:
        now = int(time.time())

    # Queue depth is checked first; a missing/None count is treated as empty.
    num_messages = data.get("messages") or 0
    if num_messages > options.crit_queue or num_messages > options.warn_queue:
        status = STATE_CRITICAL if num_messages > options.crit_queue else STATE_WARNING
        return status, "%s messages in %s queue" % (num_messages, options.queue)

    # The stats sections may be missing from the response; without them the
    # age of the last delivery cannot be determined.
    message_stats = data.get("message_stats") or {}
    deliver_details = message_stats.get("deliver_details") or {}
    last_event_ms = deliver_details.get("last_event")
    if last_event_ms is None:
        return STATE_UNKNOWN, "No delivery statistics available for %s queue" % options.queue

    # last_event is in milliseconds; floor-divide to get whole seconds.
    last_event = int(last_event_ms) // 1000
    seconds_since_last_event = abs(last_event - now)
    if seconds_since_last_event > options.crit_seconds or seconds_since_last_event > options.warn_seconds:
        status = STATE_CRITICAL if seconds_since_last_event > options.crit_seconds else STATE_WARNING
        return status, "%s seconds since last event consumed on %s" % (seconds_since_last_event, options.queue)

    return STATE_OK, "Last event consumed: %s" % time.ctime(last_event)


def main(args=None):
    """Run the check, print the one-line result and return the exit status."""
    options = getOptions(args)
    url = build_url(options)

    try:
        data = fetch_queue(url, options.username, options.password)
    except urllib2.HTTPError as e:
        print("Error code %s hitting %s" % (e.code, url))
        return STATE_CRITICAL
    except urllib2.URLError as e:
        # Connection refused, DNS failure, ...: no HTTP status code exists.
        print("Error %s hitting %s" % (e.reason, url))
        return STATE_CRITICAL
    except ValueError as e:
        print("Invalid JSON from %s: %s" % (url, e))
        return STATE_UNKNOWN

    status, message = evaluate_queue(data, options)
    print(message)
    return status


if __name__ == '__main__':
    sys.exit(main())
You can run it manually:
check_rabbitmq --host sched1 --warning-queue-size 100 --critical-queue-size 400 --warning-seconds 600 --critical-seconds 2400
Or, you can configure Nagios to run it:
# vim /etc/nagios3/conf.d/services_nagios2.cfg

# Command wrapper around the check_rabbitmq plugin. $ARG1$..$ARG5$ are filled,
# in order, from the "!"-separated values of a service's check_command:
# queue name, warning size, critical size, warning seconds, critical seconds.
define command {
    command_name check_rabbitmq
    command_line /usr/lib/nagios/plugins/check_rabbitmq --host $HOSTADDRESS$ --queue $ARG1$ --warning-queue-size $ARG2$ --critical-queue-size $ARG3$ --warning-seconds $ARG4$ --critical-seconds $ARG5$
}

# Check the "celery" queue on host sched1 with the same thresholds as the
# manual invocation (100/400 messages, 600/2400 seconds).
define service {
    host_name sched1
    service_description QUEUE
    check_command check_rabbitmq!celery!100!400!600!2400
    contact_groups pager
    use generic-service
    notification_interval 0 ; set > 0 if you want to be renotified
}