Skip to content

Commit 111d243

Browse files
committed
2 parents bfb21fd + 456dd5e commit 111d243

35 files changed

+1053
-46
lines changed
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
name: High GPU Temperature
2+
3+
description: |+
4+
This alert is triggered when the NVIDIA GPU temperature is above 90°C for 5 minutes.
5+
6+
type: STATIC
7+
nrql:
8+
query: "SELECT latest(DCGM_FI_DEV_GPU_TEMP) AS 'gpu temperature' FROM Metric WHERE metricName LIKE 'DCGM_FI_DEV_GPU_TEMP'"
9+
10+
# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
11+
valueFunction: SINGLE_VALUE
12+
13+
# List of Critical and Warning thresholds for the condition
14+
terms:
15+
- priority: CRITICAL
16+
# Operator used to compare against the threshold.
17+
operator: ABOVE
18+
# Value that triggers a violation
19+
threshold: 90
20+
# Time in seconds; 120 - 3600
21+
thresholdDuration: 300
22+
# How many data points must be in violation for the duration
23+
thresholdOccurrences: ALL
24+
25+
# Duration after which a violation automatically closes
26+
# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
27+
violationTimeLimitSeconds: 86400
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
name: XID Error
2+
3+
description: |+
4+
This alert is triggered when the latest XID error value reported by the NVIDIA GPU stays above 3 for 5 minutes.
5+
6+
type: STATIC
7+
nrql:
8+
query: "SELECT latest(DCGM_FI_DEV_XID_ERRORS) AS 'errors' FROM Metric WHERE metricName like 'DCGM_FI_DEV_XID_ERRORS'"
9+
10+
# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
11+
valueFunction: SINGLE_VALUE
12+
13+
# List of Critical and Warning thresholds for the condition
14+
terms:
15+
- priority: CRITICAL
16+
# Operator used to compare against the threshold.
17+
operator: ABOVE
18+
# Value that triggers a violation
19+
threshold: 3
20+
# Time in seconds; 120 - 3600
21+
thresholdDuration: 300
22+
# How many data points must be in violation for the duration
23+
thresholdOccurrences: ALL
24+
25+
# Duration after which a violation automatically closes
26+
# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
27+
violationTimeLimitSeconds: 86400

dashboards/elasticsearch-elasticsearch/elasticsearch.json

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -332,10 +332,16 @@
332332
},
333333
"nrqlQueries" : [ {
334334
"accountId" : 0,
335-
"query" : "FROM Metric SELECT (latest(elasticsearch.node.activeSearches) - earliest(elasticsearch.node.activeSearches))/(latest(elasticsearch.node.activeSearchesInMilliseconds) - earliest(elasticsearch.node.activeSearchesInMilliseconds)) FACET host.hostname TIMESERIES SINCE 1 WEEK AGO"
335+
"query" : "FROM Metric SELECT (latest(elasticsearch.node.activeSearchesInMilliseconds) - earliest(elasticsearch.node.activeSearchesInMilliseconds))/sum(elasticsearch.node.activeSearches) FACET capture(entity.name, r'es-node:(?P<Node>.*)') TIMESERIES SINCE 1 WEEK AGO"
336336
} ],
337+
"nullValues": {
338+
"nullValue": "zero"
339+
},
337340
"yAxisLeft" : {
338341
"zero" : false
342+
},
343+
"units": {
344+
"unit": "MS"
339345
}
340346
}
341347
}, {
@@ -358,10 +364,16 @@
358364
},
359365
"nrqlQueries" : [ {
360366
"accountId" : 0,
361-
"query" : "FROM Metric SELECT (latest(elasticsearch.node.get.totalGetRequests) - earliest(elasticsearch.node.get.totalGetRequests))/(latest(elasticsearch.node.get.timeGetRequestsInMilliseconds) - earliest(elasticsearch.node.get.timeGetRequestsInMilliseconds)) AS 'Total', (latest(elasticsearch.node.get.requestsDocumentExists) - earliest(elasticsearch.node.get.requestsDocumentExists))/(latest(elasticsearch.node.get.requestsDocumentExistsInMilliseconds) - earliest(elasticsearch.node.get.requestsDocumentExistsInMilliseconds)) AS 'Document Exists', (latest(elasticsearch.node.get.requestsDocumentMissing) - earliest(elasticsearch.node.get.requestsDocumentMissing))/(latest(elasticsearch.node.get.requestsDocumentMissingInMilliseconds) - earliest(elasticsearch.node.get.requestsDocumentMissingInMilliseconds)) AS 'Document Missing' FACET host.hostname TIMESERIES SINCE 1 WEEK AGO"
367+
"query" : "FROM Metric SELECT (latest(elasticsearch.node.get.totalGetRequests) - earliest(elasticsearch.node.get.totalGetRequests)) / (latest(elasticsearch.node.get.timeGetRequestsInMilliseconds) - earliest(elasticsearch.node.get.timeGetRequestsInMilliseconds)) AS 'Total', (latest(elasticsearch.node.get.requestsDocumentExists) - earliest(elasticsearch.node.get.requestsDocumentExists)) / (latest(elasticsearch.node.get.requestsDocumentExistsInMilliseconds) - earliest(elasticsearch.node.get.requestsDocumentExistsInMilliseconds)) AS 'Document Exists', (latest(elasticsearch.node.get.requestsDocumentMissing) - earliest(elasticsearch.node.get.requestsDocumentMissing)) / (latest(elasticsearch.node.get.requestsDocumentMissingInMilliseconds) - earliest(elasticsearch.node.get.requestsDocumentMissingInMilliseconds)) AS 'Document Missing' FACET capture(entity.name, r'es-node:(?P<Node>.*)') TIMESERIES SINCE 1 WEEK AGO"
362368
} ],
369+
"nullValues": {
370+
"nullValue": "zero"
371+
},
363372
"yAxisLeft" : {
364373
"zero" : false
374+
},
375+
"units": {
376+
"unit": "MS"
365377
}
366378
}
367379
}, {
@@ -382,7 +394,7 @@
382394
},
383395
"nrqlQueries" : [ {
384396
"accountId" : 0,
385-
"query" : "FROM Metric SELECT latest(elasticsearch.node.index.indexingOperationsFailed) - earliest(elasticsearch.node.index.indexingOperationsFailed) AS 'Failed Operations' FACET host.hostname"
397+
"query" : "FROM Metric SELECT latest(elasticsearch.node.index.indexingOperationsFailed) - earliest(elasticsearch.node.index.indexingOperationsFailed) AS 'Failed Operations' FACET capture(entity.name, r'es-node:(?P<Node>.*)')"
386398
} ]
387399
}
388400
}, {
@@ -405,7 +417,7 @@
405417
},
406418
"nrqlQueries" : [ {
407419
"accountId" : 0,
408-
"query" : "FROM Metric SELECT (latest(elasticsearch.node.indexing.documentsIndexed) - earliest(elasticsearch.node.indexing.documentsIndexed))/(latest(elasticsearch.node.indexing.timeIndexingDocumentsInMilliseconds) - earliest(elasticsearch.node.indexing.timeIndexingDocumentsInMilliseconds)) FACET host.hostname TIMESERIES SINCE 1 WEEK AGO"
420+
"query" : "FROM Metric SELECT (latest(elasticsearch.node.indexing.documentsIndexed) - earliest(elasticsearch.node.indexing.documentsIndexed))/(latest(elasticsearch.node.indexing.timeIndexingDocumentsInMilliseconds) - earliest(elasticsearch.node.indexing.timeIndexingDocumentsInMilliseconds)) FACET capture(entity.name, r'es-node:(?P<Node>.*)') TIMESERIES SINCE 1 WEEK AGO"
409421
} ],
410422
"yAxisLeft" : {
411423
"zero" : false
@@ -431,8 +443,11 @@
431443
},
432444
"nrqlQueries" : [ {
433445
"accountId" : 0,
434-
"query" : "FROM Metric SELECT (latest(elasticsearch.node.merges.segmentMerges) - earliest(elasticsearch.node.merges.segmentMerges))/(latest(elasticsearch.node.merges.totalSegmentMergingInMilliseconds) - earliest(elasticsearch.node.merges.totalSegmentMergingInMilliseconds)) FACET host.hostname TIMESERIES SINCE 1 WEEK AGO"
446+
"query" : "FROM Metric SELECT (latest(elasticsearch.node.merges.totalSegmentMergingInMilliseconds) - earliest(elasticsearch.node.merges.totalSegmentMergingInMilliseconds)) / (latest(elasticsearch.node.merges.segmentMerges) - earliest(elasticsearch.node.merges.segmentMerges)) FACET capture(entity.name, r'es-node:(?P<Node>.*)') TIMESERIES SINCE 1 WEEK AGO"
435447
} ],
448+
"nullValues": {
449+
"nullValue": "zero"
450+
},
436451
"yAxisLeft" : {
437452
"zero" : false
438453
}
@@ -457,12 +472,18 @@
457472
},
458473
"nrqlQueries" : [ {
459474
"accountId" : 0,
460-
"query" : "FROM Metric SELECT (latest(elasticsearch.node.refresh.total) - earliest(elasticsearch.node.refresh.total))/(latest(elasticsearch.node.refresh.totalInMilliseconds) - earliest(elasticsearch.node.refresh.totalInMilliseconds)) FACET host.hostname TIMESERIES SINCE 1 WEEK AGO"
475+
"query" : "FROM Metric SELECT (latest(elasticsearch.node.refresh.totalInMilliseconds) - earliest(elasticsearch.node.refresh.totalInMilliseconds))/(latest(elasticsearch.node.refresh.total) - earliest(elasticsearch.node.refresh.total)) FACET capture(entityName, r'es-node:(?P<Node>.*)') TIMESERIES SINCE 1 WEEK AGO"
461476
} ],
477+
"nullValues": {
478+
"nullValue": "zero"
479+
},
462480
"yAxisLeft" : {
463481
"zero" : false
482+
},
483+
"units": {
484+
"unit": "MS"
464485
}
465486
}
466487
} ]
467488
} ]
468-
}
489+
}

0 commit comments

Comments
 (0)