From 49f1a22b380a74211f791f490d5cc7db5f8c901c Mon Sep 17 00:00:00 2001
From: Nicolas L
Date: Mon, 28 Nov 2016 14:58:23 +0100
Subject: [PATCH 1/3] global var for threshold. Better OK message.

Keep the JVM thresholds in class attributes that are shared by the
option help text and by check(). This also lowers the defaults from
97%/90% to 85%/75% (critical/warning). The OK message now reports the
warning threshold that was actually applied.
---
 check_es_jvm_usage.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/check_es_jvm_usage.py b/check_es_jvm_usage.py
index 444bd93..7e62c78 100644
--- a/check_es_jvm_usage.py
+++ b/check_es_jvm_usage.py
@@ -11,6 +11,8 @@
 
 
 class ESJVMHealthCheck(NagiosCheck):
+    default_critical_threshold = 85
+    default_warning_threshold = 75
 
     def __init__(self):
 
@@ -20,16 +22,20 @@ def __init__(self):
         self.add_option('P', 'port', 'port', 'The ES port - defaults to 9200')
         self.add_option('C', 'critical_threshold', 'critical_threshold',
                         'The level at which we throw a CRITICAL alert'
-                        ' - defaults to 97% of the JVM setting')
+                        ' - defaults to '
+                        + str(ESJVMHealthCheck.default_critical_threshold)
+                        + '% of the JVM setting')
         self.add_option('W', 'warning_threshold', 'warning_threshold',
                         'The level at which we throw a WARNING alert'
-                        ' - defaults to 90% of the JVM setting')
+                        ' - defaults to '
+                        + str(ESJVMHealthCheck.default_warning_threshold)
+                        + '% of the JVM setting')
 
     def check(self, opts, args):
         host = opts.host
         port = int(opts.port or '9200')
-        critical = int(opts.critical_threshold or '97')
-        warning = int(opts.warning_threshold or '90')
+        critical = int(opts.critical_threshold or ESJVMHealthCheck.default_critical_threshold)
+        warning = int(opts.warning_threshold or ESJVMHealthCheck.default_warning_threshold)
 
         try:
             response = urllib2.urlopen(r'http://%s:%d/_nodes/stats/jvm'
@@ -85,7 +91,8 @@ def check(self, opts, args):
                             str("\r\n".join(warning_details))))
         else:
             raise Status("OK", "All nodes in the cluster are currently below "
-                         "the % JVM mem warning threshold")
+                         "the %s%% JVM mem warning threshold"
+                         % warning)
 
 if __name__ == "__main__":
     ESJVMHealthCheck().run()

From e71a8a9e7ed029c8106ab0719294599fc97a0c1c Mon Sep 17 00:00:00 2001
From: Nicolas L
Date: Mon, 28 Nov 2016 14:58:49 +0100
Subject: [PATCH 2/3] add check disk for low/high watermark

New check_es_disk_usage.py plugin: it queries /_nodes/stats/fs and
compares the used disk percentage of every node against the
thresholds. Critical defaults to the high watermark (95%) and warning
to the low watermark (85%).
---
 check_es_disk_usage.py | 113 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 113 insertions(+)
 create mode 100644 check_es_disk_usage.py

diff --git a/check_es_disk_usage.py b/check_es_disk_usage.py
new file mode 100644
index 0000000..01f9ecc
--- /dev/null
+++ b/check_es_disk_usage.py
@@ -0,0 +1,113 @@
+#!/usr/bin/python
+from nagioscheck import NagiosCheck, UsageError
+from nagioscheck import PerformanceMetric, Status
+import urllib2
+import optparse
+
+try:
+    import json
+except ImportError:
+    import simplejson as json
+
+
+class ESDiskHealthCheck(NagiosCheck):
+    """Nagios check for the per-node disk usage of an ES cluster."""
+
+    # Default thresholds, in percent of disk used.
+    default_low_watermark = 85
+    default_high_watermark = 95
+
+    def __init__(self):
+        """Register the host, port and threshold command line options."""
+
+        NagiosCheck.__init__(self)
+
+        self.add_option('H', 'host', 'host', 'The cluster to check')
+        self.add_option('P', 'port', 'port', 'The ES port - defaults to 9200')
+        self.add_option('C', 'critical_threshold', 'critical_threshold',
+                        'The level at which we throw a CRITICAL alert'
+                        ' - defaults to high watermark: '
+                        + str(ESDiskHealthCheck.default_high_watermark)
+                        + '% used of the disk setting')
+        self.add_option('W', 'warning_threshold', 'warning_threshold',
+                        'The level at which we throw a WARNING alert'
+                        ' - defaults to the low watermark: '
+                        + str(ESDiskHealthCheck.default_low_watermark)
+                        + '% used of the disk setting')
+
+    def check(self, opts, args):
+        """Compare the disk usage of every node against the thresholds.
+
+        The result is reported by raising a nagioscheck Status: unknown
+        or critical when the API call fails, Critical or Warning when at
+        least one node has reached the matching threshold, OK otherwise.
+        """
+        host = opts.host
+        port = int(opts.port or '9200')
+        # Critical maps to the high watermark, warning to the low one.
+        critical = int(opts.critical_threshold
+                       or ESDiskHealthCheck.default_high_watermark)
+        warning = int(opts.warning_threshold
+                      or ESDiskHealthCheck.default_low_watermark)
+
+        try:
+            response = urllib2.urlopen(r'http://%s:%d/_nodes/stats/fs'
+                                       % (host, port))
+        except urllib2.HTTPError, e:
+            raise Status('unknown', ("API failure", None,
+                         "API failure:\n\n%s" % str(e)))
+        except urllib2.URLError, e:
+            raise Status('critical', (e.reason))
+
+        response_body = response.read()
+
+        try:
+            nodes_disk_data = json.loads(response_body)
+        except ValueError:
+            raise Status('unknown', ("API returned nonsense",))
+
+        criticals = 0
+        critical_details = []
+        warnings = 0
+        warning_details = []
+
+        nodes = nodes_disk_data['nodes']
+        for node in nodes:
+            disk_total = nodes[node]['fs']['total']['total_in_bytes']
+            disk_free = nodes[node]['fs']['total']['free_in_bytes']
+            node_name = nodes[node]['host']
+            disk_used_percent = (disk_total - disk_free) * 100 / disk_total
+            if int(disk_used_percent) >= critical:
+                criticals = criticals + 1
+                critical_details.append("%s currently running at %s%% disk "
+                                        % (node_name, disk_used_percent))
+            elif (int(disk_used_percent) >= warning and
+                  int(disk_used_percent) < critical):
+                warnings = warnings + 1
+                warning_details.append("%s currently running at %s%% disk "
+                                       % (node_name, disk_used_percent))
+
+        if criticals > 0:
+            raise Status("Critical",
+                         "There are '%s' node(s) in the cluster that have "
+                         "breached the %% disk usage critical threshold "
+                         "of %s%%. They are:\r\n%s"
+                         % (
+                             criticals,
+                             critical,
+                             str("\r\n".join(critical_details))
+                             ))
+        elif warnings > 0:
+            raise Status("Warning",
+                         "There are '%s' node(s) in the cluster that have "
+                         "breached the %% disk usage warning threshold of "
+                         "%s%%. They are:\r\n%s"
+                         % (warnings, warning,
+                            str("\r\n".join(warning_details))))
+        else:
+            raise Status("OK", "All nodes in the cluster are currently below "
+                         "the %s%% disk warning threshold" % warning)
+
+if __name__ == "__main__":
+    ESDiskHealthCheck().run()
+

From 9573e9382a3744a9d6d5f3df8f1a8ec6f315b33d Mon Sep 17 00:00:00 2001
From: Nicolas L
Date: Mon, 28 Nov 2016 15:14:12 +0100
Subject: [PATCH 3/3] add space to OK message

---
 check_es_nodes.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/check_es_nodes.py b/check_es_nodes.py
index ba7c0a1..7c1cec7 100644
--- a/check_es_nodes.py
+++ b/check_es_nodes.py
@@ -49,7 +49,7 @@ def check(self, opts, args):
                          "reporting as '%s' but we expected '%s'"
                          % (active_cluster_nodes, nodes_in_cluster))
         else:
-            raise Status('OK', "Number of nodes in the cluster is '%s'"
+            raise Status('OK', "Number of nodes in the cluster is '%s' "
                          "which is >= %s as expected" % (active_cluster_nodes, nodes_in_cluster))
 
 if __name__ == "__main__":