Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add disk space check. Extract global vars. Change jvm threshold #4

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 100 additions & 0 deletions check_es_disk_usage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
#!/usr/bin/python
from nagioscheck import NagiosCheck, UsageError
from nagioscheck import PerformanceMetric, Status
import urllib2
import optparse

try:
import json
except ImportError:
import simplejson as json


class ESDiskHealthCheck(NagiosCheck):
    """Nagios check for per-node disk usage of an Elasticsearch cluster.

    Queries ``/_nodes/stats/fs`` on the given host and compares the
    percentage of disk used on every node against a warning and a critical
    threshold.
    """

    # Default thresholds, expressed as percent of disk used.
    default_low_watermark = 85   # default WARNING threshold
    default_high_watermark = 95  # default CRITICAL threshold

    def __init__(self):
        """Register the command-line options understood by this check."""
        NagiosCheck.__init__(self)

        self.add_option('H', 'host', 'host', 'The cluster to check')
        self.add_option('P', 'port', 'port', 'The ES port - defaults to 9200')
        self.add_option('C', 'critical_threshold', 'critical_threshold',
                        'The level at which we throw a CRITICAL alert'
                        ' - defaults to high watermark: '
                        + str(ESDiskHealthCheck.default_high_watermark)
                        + '% used of the disk setting')
        self.add_option('W', 'warning_threshold', 'warning_threshold',
                        'The level at which we throw a WARNING alert'
                        ' - defaults to the low watermark: '
                        + str(ESDiskHealthCheck.default_low_watermark)
                        + '% used of the disk setting')

    def check(self, opts, args):
        """Run the check and report the result by raising ``Status``.

        Parameters:
            opts: parsed options; reads ``host``, ``port``,
                ``critical_threshold`` and ``warning_threshold``.
            args: positional arguments (unused).

        Raises:
            Status: always. 'unknown' when the API answers with an HTTP
                error or an unparseable/unexpected body, 'critical' when
                the host cannot be reached or at least one node is at or
                above the critical threshold, 'Warning' when at least one
                node is at or above the warning threshold, 'OK' otherwise.
        """
        host = opts.host
        port = int(opts.port or '9200')
        # CRITICAL defaults to the *high* watermark and WARNING to the *low*
        # one, matching the option help text.
        critical = int(opts.critical_threshold
                       or ESDiskHealthCheck.default_high_watermark)
        warning = int(opts.warning_threshold
                      or ESDiskHealthCheck.default_low_watermark)

        try:
            response = urllib2.urlopen(r'http://%s:%d/_nodes/stats/fs'
                                       % (host, port))
        except urllib2.HTTPError as e:
            raise Status('unknown', ("API failure", None,
                                     "API failure:\n\n%s" % str(e)))
        except urllib2.URLError as e:
            # e.reason may be an exception object rather than text.
            raise Status('critical', str(e.reason))

        try:
            response_body = response.read()
        finally:
            response.close()

        try:
            nodes_disk_data = json.loads(response_body)
        except ValueError:
            raise Status('unknown', ("API returned nonsense",))

        nodes = None
        if isinstance(nodes_disk_data, dict):
            nodes = nodes_disk_data.get('nodes')
        if not isinstance(nodes, dict):
            # Valid JSON, but not the structure this check relies on.
            raise Status('unknown', ("API returned nonsense",))

        critical_details = []
        warning_details = []

        for node_id, node_stats in nodes.items():
            fs_total = (node_stats.get('fs') or {}).get('total') or {}
            disk_total = fs_total.get('total_in_bytes')
            disk_free = fs_total.get('free_in_bytes')
            if not disk_total or disk_free is None:
                # No usable filesystem totals for this node: a usage
                # percentage cannot be computed (and a zero total would
                # divide by zero), so the node is left out of the check.
                continue

            node_name = node_stats.get('host', node_id)
            # Floor division keeps the percentage an integer on every
            # Python version.
            disk_used_percent = (disk_total - disk_free) * 100 // disk_total
            detail = ("%s currently running at %s%% disk "
                      % (node_name, disk_used_percent))
            if disk_used_percent >= critical:
                critical_details.append(detail)
            elif disk_used_percent >= warning:
                warning_details.append(detail)

        if critical_details:
            raise Status("Critical",
                         "There are '%s' node(s) in the cluster that have "
                         "breached the %% disk usage critical threshold "
                         "of %s%%. They are:\r\n%s"
                         % (len(critical_details), critical,
                            "\r\n".join(critical_details)))
        elif warning_details:
            raise Status("Warning",
                         "There are '%s' node(s) in the cluster that have "
                         "breached the %% disk usage warning threshold of "
                         "%s%%. They are:\r\n%s"
                         % (len(warning_details), warning,
                            "\r\n".join(warning_details)))
        else:
            # Report the threshold actually in effect, not just a bare "%".
            raise Status("OK", "All nodes in the cluster are currently below "
                               "the %s%% disk warning threshold" % warning)


if __name__ == "__main__":
    ESDiskHealthCheck().run()

17 changes: 12 additions & 5 deletions check_es_jvm_usage.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@


class ESJVMHealthCheck(NagiosCheck):
default_critical_threshold = 85
default_warning_threshold = 75

def __init__(self):

Expand All @@ -20,16 +22,20 @@ def __init__(self):
self.add_option('P', 'port', 'port', 'The ES port - defaults to 9200')
self.add_option('C', 'critical_threshold', 'critical_threshold',
'The level at which we throw a CRITICAL alert'
' - defaults to 97% of the JVM setting')
' - defaults to '
+ str(ESJVMHealthCheck.default_critical_threshold)
+'% of the JVM setting')
self.add_option('W', 'warning_threshold', 'warning_threshold',
'The level at which we throw a WARNING alert'
' - defaults to 90% of the JVM setting')
' - defaults to '
+ str(ESJVMHealthCheck.default_warning_threshold)
+'% of the JVM setting')

def check(self, opts, args):
host = opts.host
port = int(opts.port or '9200')
critical = int(opts.critical_threshold or '97')
warning = int(opts.warning_threshold or '90')
critical = int(opts.critical_threshold or ESJVMHealthCheck.default_critical_threshold)
warning = int(opts.warning_threshold or ESJVMHealthCheck.default_warning_threshold)

try:
response = urllib2.urlopen(r'http://%s:%d/_nodes/stats/jvm'
Expand Down Expand Up @@ -85,7 +91,8 @@ def check(self, opts, args):
str("\r\n".join(warning_details))))
else:
raise Status("OK", "All nodes in the cluster are currently below "
"the % JVM mem warning threshold")
"the %s%% JVM mem warning threshold"
% (ESJVMHealthCheck.default_warning_threshold))

if __name__ == "__main__":
ESJVMHealthCheck().run()
2 changes: 1 addition & 1 deletion check_es_nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def check(self, opts, args):
"reporting as '%s' but we expected '%s'"
% (active_cluster_nodes, nodes_in_cluster))
else:
raise Status('OK', "Number of nodes in the cluster is '%s'"
raise Status('OK', "Number of nodes in the cluster is '%s' "
"which is >= %s as expected" % (active_cluster_nodes, nodes_in_cluster))

if __name__ == "__main__":
Expand Down