Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
harrykimpel committed Nov 16, 2023
2 parents bfb21fd + 456dd5e commit 111d243
Show file tree
Hide file tree
Showing 35 changed files with 1,053 additions and 46 deletions.
27 changes: 27 additions & 0 deletions alert-policies/nvidia-dcgm/HighTemperature.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Alert condition: opens a critical violation when an NVIDIA GPU stays hot.
name: High GPU Temperature

description: |+
  This alert is triggered when the NVIDIA GPU temperature is above 90 degrees Celsius for 5 minutes.
type: STATIC
nrql:
  # DCGM_FI_DEV_GPU_TEMP is a temperature reading (degrees Celsius per the
  # DCGM field reference), so the threshold below is in degrees, not percent.
  query: "SELECT latest(DCGM_FI_DEV_GPU_TEMP) AS 'gpu temperature' FROM Metric WHERE metricName LIKE 'DCGM_FI_DEV_GPU_TEMP'"

# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# List of Critical and Warning thresholds for the condition
terms:
  - priority: CRITICAL
    # Operator used to compare against the threshold.
    operator: ABOVE
    # Value that triggers a violation
    threshold: 90
    # Time in seconds; 120 - 3600
    thresholdDuration: 300
    # How many data points must be in violation for the duration
    thresholdOccurrences: ALL

# Duration after which a violation automatically closes
# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
27 changes: 27 additions & 0 deletions alert-policies/nvidia-dcgm/XidError.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Alert condition: opens a critical violation on sustained NVIDIA XID errors.
name: XID Error

description: |+
  This alert is triggered when the latest XID error value reported by the NVIDIA GPU is above 3 for 5 minutes.
type: STATIC
nrql:
  # NOTE(review): DCGM appears to report the most recent XID error *code* in
  # this field rather than an error count - confirm that 3 is the intended cutoff.
  query: "SELECT latest(DCGM_FI_DEV_XID_ERRORS) AS 'errors' FROM Metric WHERE metricName like 'DCGM_FI_DEV_XID_ERRORS'"

# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# List of Critical and Warning thresholds for the condition
terms:
  - priority: CRITICAL
    # Operator used to compare against the threshold.
    operator: ABOVE
    # Value that triggers a violation
    threshold: 3
    # Time in seconds; 120 - 3600
    thresholdDuration: 300
    # How many data points must be in violation for the duration
    thresholdOccurrences: ALL

# Duration after which a violation automatically closes
# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
35 changes: 28 additions & 7 deletions dashboards/elasticsearch-elasticsearch/elasticsearch.json
Original file line number Diff line number Diff line change
Expand Up @@ -332,10 +332,16 @@
},
"nrqlQueries" : [ {
"accountId" : 0,
"query" : "FROM Metric SELECT (latest(elasticsearch.node.activeSearches) - earliest(elasticsearch.node.activeSearches))/(latest(elasticsearch.node.activeSearchesInMilliseconds) - earliest(elasticsearch.node.activeSearchesInMilliseconds)) FACET host.hostname TIMESERIES SINCE 1 WEEK AGO"
"query" : "FROM Metric SELECT (latest(elasticsearch.node.activeSearchesInMilliseconds) - earliest(elasticsearch.node.activeSearchesInMilliseconds))/sum(elasticsearch.node.activeSearches) FACET capture(entity.name, r'es-node:(?P<Node>.*)') TIMESERIES SINCE 1 WEEK AGO"
} ],
"nullValues": {
"nullValue": "zero"
},
"yAxisLeft" : {
"zero" : false
},
"units": {
"unit": "MS"
}
}
}, {
Expand All @@ -358,10 +364,16 @@
},
"nrqlQueries" : [ {
"accountId" : 0,
"query" : "FROM Metric SELECT (latest(elasticsearch.node.get.totalGetRequests) - earliest(elasticsearch.node.get.totalGetRequests))/(latest(elasticsearch.node.get.timeGetRequestsInMilliseconds) - earliest(elasticsearch.node.get.timeGetRequestsInMilliseconds)) AS 'Total', (latest(elasticsearch.node.get.requestsDocumentExists) - earliest(elasticsearch.node.get.requestsDocumentExists))/(latest(elasticsearch.node.get.requestsDocumentExistsInMilliseconds) - earliest(elasticsearch.node.get.requestsDocumentExistsInMilliseconds)) AS 'Document Exists', (latest(elasticsearch.node.get.requestsDocumentMissing) - earliest(elasticsearch.node.get.requestsDocumentMissing))/(latest(elasticsearch.node.get.requestsDocumentMissingInMilliseconds) - earliest(elasticsearch.node.get.requestsDocumentMissingInMilliseconds)) AS 'Document Missing' FACET host.hostname TIMESERIES SINCE 1 WEEK AGO"
"query" : "FROM Metric SELECT (latest(elasticsearch.node.get.totalGetRequests) - earliest(elasticsearch.node.get.totalGetRequests)) / (latest(elasticsearch.node.get.timeGetRequestsInMilliseconds) - earliest(elasticsearch.node.get.timeGetRequestsInMilliseconds)) AS 'Total', (latest(elasticsearch.node.get.requestsDocumentExists) - earliest(elasticsearch.node.get.requestsDocumentExists)) / (latest(elasticsearch.node.get.requestsDocumentExistsInMilliseconds) - earliest(elasticsearch.node.get.requestsDocumentExistsInMilliseconds)) AS 'Document Exists', (latest(elasticsearch.node.get.requestsDocumentMissing) - earliest(elasticsearch.node.get.requestsDocumentMissing)) / (latest(elasticsearch.node.get.requestsDocumentMissingInMilliseconds) - earliest(elasticsearch.node.get.requestsDocumentMissingInMilliseconds)) AS 'Document Missing' FACET capture(entity.name, r'es-node:(?P<Node>.*)') TIMESERIES SINCE 1 WEEK AGO"
} ],
"nullValues": {
"nullValue": "zero"
},
"yAxisLeft" : {
"zero" : false
},
"units": {
"unit": "MS"
}
}
}, {
Expand All @@ -382,7 +394,7 @@
},
"nrqlQueries" : [ {
"accountId" : 0,
"query" : "FROM Metric SELECT latest(elasticsearch.node.index.indexingOperationsFailed) - earliest(elasticsearch.node.index.indexingOperationsFailed) AS 'Failed Operations' FACET host.hostname"
"query" : "FROM Metric SELECT latest(elasticsearch.node.index.indexingOperationsFailed) - earliest(elasticsearch.node.index.indexingOperationsFailed) AS 'Failed Operations' FACET capture(entity.name, r'es-node:(?P<Node>.*)')"
} ]
}
}, {
Expand All @@ -405,7 +417,7 @@
},
"nrqlQueries" : [ {
"accountId" : 0,
"query" : "FROM Metric SELECT (latest(elasticsearch.node.indexing.documentsIndexed) - earliest(elasticsearch.node.indexing.documentsIndexed))/(latest(elasticsearch.node.indexing.timeIndexingDocumentsInMilliseconds) - earliest(elasticsearch.node.indexing.timeIndexingDocumentsInMilliseconds)) FACET host.hostname TIMESERIES SINCE 1 WEEK AGO"
"query" : "FROM Metric SELECT (latest(elasticsearch.node.indexing.documentsIndexed) - earliest(elasticsearch.node.indexing.documentsIndexed))/(latest(elasticsearch.node.indexing.timeIndexingDocumentsInMilliseconds) - earliest(elasticsearch.node.indexing.timeIndexingDocumentsInMilliseconds)) FACET capture(entity.name, r'es-node:(?P<Node>.*)') TIMESERIES SINCE 1 WEEK AGO"
} ],
"yAxisLeft" : {
"zero" : false
Expand All @@ -431,8 +443,11 @@
},
"nrqlQueries" : [ {
"accountId" : 0,
"query" : "FROM Metric SELECT (latest(elasticsearch.node.merges.segmentMerges) - earliest(elasticsearch.node.merges.segmentMerges))/(latest(elasticsearch.node.merges.totalSegmentMergingInMilliseconds) - earliest(elasticsearch.node.merges.totalSegmentMergingInMilliseconds)) FACET host.hostname TIMESERIES SINCE 1 WEEK AGO"
"query" : "FROM Metric SELECT (latest(elasticsearch.node.merges.totalSegmentMergingInMilliseconds) - earliest(elasticsearch.node.merges.totalSegmentMergingInMilliseconds)) / (latest(elasticsearch.node.merges.segmentMerges) - earliest(elasticsearch.node.merges.segmentMerges)) FACET capture(entity.name, r'es-node:(?P<Node>.*)') TIMESERIES SINCE 1 WEEK AGO"
} ],
"nullValues": {
"nullValue": "zero"
},
"yAxisLeft" : {
"zero" : false
}
Expand All @@ -457,12 +472,18 @@
},
"nrqlQueries" : [ {
"accountId" : 0,
"query" : "FROM Metric SELECT (latest(elasticsearch.node.refresh.total) - earliest(elasticsearch.node.refresh.total))/(latest(elasticsearch.node.refresh.totalInMilliseconds) - earliest(elasticsearch.node.refresh.totalInMilliseconds)) FACET host.hostname TIMESERIES SINCE 1 WEEK AGO"
"query" : "FROM Metric SELECT (latest(elasticsearch.node.refresh.totalInMilliseconds) - earliest(elasticsearch.node.refresh.totalInMilliseconds))/(latest(elasticsearch.node.refresh.total) - earliest(elasticsearch.node.refresh.total)) FACET capture(entityName, r'es-node:(?P<Node>.*)') TIMESERIES SINCE 1 WEEK AGO"
} ],
"nullValues": {
"nullValue": "zero"
},
"yAxisLeft" : {
"zero" : false
},
"units": {
"unit": "MS"
}
}
} ]
} ]
}
}
Loading

0 comments on commit 111d243

Please sign in to comment.