Checking RabbitMQ queue size/age with Nagios

For months we were using RabbitMQ (with celery) with no real insight into what was going on inside the queue. Recently, we deployed the management plug-in, which has a nifty web UI:

From there, it seemed like a logical thing to add a Nagios check for. Here is some Python code to do just that.

#!/usr/bin/python

from optparse import OptionParser
import urllib2
import json
import pprint
import time
import datetime

def getOptions():
    """Parse the command line and return the resulting option values.

    Registers the connection options (host, queue, username, password,
    port) and the warning/critical thresholds for queue size and for the
    number of seconds since the last delivered event, then parses the
    process arguments. Positional arguments are discarded; only the
    options object is returned.
    """
    parser = OptionParser()

    # (flag, dest, help, type, default) -- registered in this order so the
    # generated --help output lists them in the same sequence.
    option_table = (
        ("--host", "host", "Host rabbitmq is running on", "string", "localhost"),
        ("--queue", "queue", "Name of the queue in inspect", "string", "celery"),
        ("--username", "username", "RabbitMQ API username", "string", "rabbitmq"),
        ("--password", "password", "RabbitMQ API password", "string", "rabbitmq"),
        ("--port", "port", "RabbitMQ API port", "string", "55672"),
        ("--warning-queue-size", "warn_queue", "Size of the queue to alert as warning", "int", 10000),
        ("--critical-queue-size", "crit_queue", "Size of the queue to alert as critical", "int", 20000),
        ("--warning-seconds", "warn_seconds", "Last event processes in seconds ago to alert as warning", "int", 3600),
        ("--critical-seconds", "crit_seconds", "Last event processes in seconds ago to alert as critical", "int", 14400),
    )
    for flag, dest_name, help_text, value_type, default_value in option_table:
        parser.add_option(flag, dest=dest_name, help=help_text, type=value_type, default=default_value)

    parsed_options, _positional = parser.parse_args()
    return parsed_options

if __name__ == '__main__':
    # Nagios check: query the RabbitMQ management API for one queue and
    # alert on (a) the number of messages in the queue and (b) the time
    # since the last message was delivered. The process exit status is the
    # Nagios service state.

    # Imported here so the script does not depend on the site-provided
    # exit() helper, which is absent when Python runs without `site`.
    import sys

    # Nagios plugin return codes.
    STATE_OK = 0
    STATE_WARNING = 1
    STATE_CRITICAL = 2
    STATE_UNKNOWN = 3

    options = getOptions()

    # NOTE: the vhost segment ("reach") is hard-coded in the API path.
    url = "http://%s:%s/api/queues/reach/%s" % (options.host, options.port, options.queue)

    # handle HTTP Auth
    password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
    password_mgr.add_password(None, url, options.username, options.password)
    opener = urllib2.build_opener(urllib2.HTTPBasicAuthHandler(password_mgr))

    try:
        request = opener.open(url)
        try:
            response = request.read()
        finally:
            # Always release the connection, even if read() fails.
            request.close()
    except urllib2.HTTPError as e:
        # The API answered with an error status; the queue cannot be checked.
        print("Error code %s hitting %s" % (e.code, url))
        sys.exit(STATE_CRITICAL)
    except urllib2.URLError as e:
        # Connection-level failure (refused, DNS, ...): report it instead
        # of dying with a traceback.
        print("Error %s hitting %s" % (e.reason, url))
        sys.exit(STATE_CRITICAL)

    try:
        data = json.loads(response)
    except ValueError:
        print("Could not parse response from %s as JSON" % url)
        sys.exit(STATE_UNKNOWN)

    # --- queue size check ---
    num_messages = data.get("messages")
    if num_messages is None:
        print("No message count reported for %s queue" % options.queue)
        sys.exit(STATE_UNKNOWN)

    if num_messages > options.crit_queue or num_messages > options.warn_queue:
        print("%s messages in %s queue" % (num_messages, options.queue))
        sys.exit(STATE_CRITICAL if num_messages > options.crit_queue else STATE_WARNING)

    # --- last delivery age check ---
    # Any of these levels may be missing from the response; fall back to
    # empty dicts so that case is reported cleanly below.
    message_stats = data.get("message_stats") or {}
    deliver_details = message_stats.get("deliver_details") or {}
    last_event_ms = deliver_details.get("last_event")
    if last_event_ms is None:
        print("No delivery statistics reported for %s queue" % options.queue)
        sys.exit(STATE_UNKNOWN)

    # The value is divided by 1000 to get seconds for comparison with time.time().
    last_event = int(last_event_ms) // 1000
    last_event_time = time.ctime(last_event)

    last_event_time_diff_seconds = abs(last_event - int(time.time()))
    if last_event_time_diff_seconds > options.crit_seconds or last_event_time_diff_seconds > options.warn_seconds:
        print("%s seconds since last event consumed on %s" % (last_event_time_diff_seconds, options.queue))
        sys.exit(STATE_CRITICAL if last_event_time_diff_seconds > options.crit_seconds else STATE_WARNING)

    print("Last event consumed: %s" % last_event_time)
    sys.exit(STATE_OK)

You can run it manually:

check_rabbitmq --host sched1 --warning-queue-size 100 --critical-queue-size 400 --warning-seconds 600 --critical-seconds 2400

Or, you can configure Nagios to run it:

# vim /etc/nagios3/conf.d/services_nagios2.cfg
define command {
        command_name    check_rabbitmq
        command_line    /usr/lib/nagios/plugins/check_rabbitmq --host $HOSTADDRESS$ --queue $ARG1$ --warning-queue-size $ARG2$ --critical-queue-size $ARG3$ --warning-seconds $ARG4$ --critical-seconds $ARG5$
}

define service {
        host_name                       sched1
        service_description             QUEUE
        check_command                   check_rabbitmq!celery!100!400!600!2400
        contact_groups                  pager
        use                             generic-service
        notification_interval           0 ; set > 0 if you want to be renotified
}


I'm currently working at NerdWallet, a startup in San Francisco trying to bring clarity to all of life's financial decisions. We're hiring like crazy. Hit me up on Twitter, I would love to talk.

Follow @chase_seibert on Twitter