-
Notifications
You must be signed in to change notification settings - Fork 302
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #2412 from newrelic/release
Release 5/16/24
- Loading branch information
Showing
20 changed files
with
454 additions
and
21 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
name: High GPU Temperature | ||
|
||
description: |+ | ||
This alert is triggered when the GPU Temperature exceeds 85 degrees Celsius for 5 minutes. | ||
type: STATIC | ||
nrql: | ||
query: "SELECT latest(DCGM_FI_DEV_GPU_TEMP) AS 'gpu temperature' FROM Metric WHERE metricName LIKE 'DCGM_FI_DEV_GPU_TEMP'" | ||
|
||
# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE) | ||
valueFunction: SINGLE_VALUE | ||
|
||
# List of Critical and Warning thresholds for the condition | ||
terms: | ||
- priority: CRITICAL | ||
# Operator used to compare against the threshold. | ||
operator: ABOVE | ||
# Value that triggers a violation | ||
threshold: 90 | ||
# Time in seconds; 120 - 3600 | ||
thresholdDuration: 300 | ||
# How many data points must be in violation for the duration | ||
thresholdOccurrences: ALL | ||
- priority: WARNING | ||
# Operator used to compare against the threshold. | ||
operator: ABOVE | ||
# Value that triggers a violation | ||
threshold: 85 | ||
# Time in seconds; 120 - 3600 | ||
thresholdDuration: 300 | ||
# How many data points must be in violation for the duration | ||
thresholdOccurrences: ALL | ||
|
||
|
||
# Duration after which a violation automatically closes | ||
# Time in seconds; 300 - 2592000 (Default: 86400 [1 day]) | ||
violationTimeLimitSeconds: 86400 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,354 @@ | ||
{ | ||
"name": "NVML", | ||
"description": null, | ||
"pages": [ | ||
{ | ||
"name": "NVML", | ||
"description": null, | ||
"widgets": [ | ||
{ | ||
"title": "", | ||
"layout": { | ||
"column": 1, | ||
"row": 1, | ||
"width": 3, | ||
"height": 3 | ||
}, | ||
"linkedEntityGuids": null, | ||
"visualization": { | ||
"id": "viz.markdown" | ||
}, | ||
"rawConfiguration": { | ||
"text": "![NVML icon](https://raw.githubusercontent.com/newrelic/newrelic-quickstarts/main/quickstarts/nvidia-dcgm/logo.png)" | ||
} | ||
}, | ||
{ | ||
"title": "Device count", | ||
"layout": { | ||
"column": 4, | ||
"row": 1, | ||
"width": 3, | ||
"height": 3 | ||
}, | ||
"linkedEntityGuids": null, | ||
"visualization": { | ||
"id": "viz.billboard" | ||
}, | ||
"rawConfiguration": { | ||
"facet": { | ||
"showOtherSeries": false | ||
}, | ||
"nrqlQueries": [ | ||
{ | ||
"accountIds": [], | ||
"query": "SELECT latest(device_count) FROM nvmlSample" | ||
} | ||
], | ||
"platformOptions": { | ||
"ignoreTimeRange": false | ||
} | ||
} | ||
}, | ||
{ | ||
"title": "GPU Temperature ", | ||
"layout": { | ||
"column": 7, | ||
"row": 1, | ||
"width": 4, | ||
"height": 3 | ||
}, | ||
"linkedEntityGuids": null, | ||
"visualization": { | ||
"id": "viz.line" | ||
}, | ||
"rawConfiguration": { | ||
"facet": { | ||
"showOtherSeries": false | ||
}, | ||
"legend": { | ||
"enabled": true | ||
}, | ||
"nrqlQueries": [ | ||
{ | ||
"accountIds": [], | ||
"query": "SELECT latest(temperature_gpu) FROM nvmlSample TIMESERIES " | ||
} | ||
], | ||
"platformOptions": { | ||
"ignoreTimeRange": false | ||
}, | ||
"thresholds": { | ||
"isLabelVisible": true | ||
}, | ||
"units": { | ||
"unit": "CELSIUS" | ||
}, | ||
"yAxisLeft": { | ||
"zero": true | ||
}, | ||
"yAxisRight": { | ||
"zero": true | ||
} | ||
} | ||
}, | ||
{ | ||
"title": "Clock memory (MHz)", | ||
"layout": { | ||
"column": 11, | ||
"row": 1, | ||
"width": 2, | ||
"height": 3 | ||
}, | ||
"linkedEntityGuids": null, | ||
"visualization": { | ||
"id": "viz.billboard" | ||
}, | ||
"rawConfiguration": { | ||
"facet": { | ||
"showOtherSeries": false | ||
}, | ||
"nrqlQueries": [ | ||
{ | ||
"accountIds": [], | ||
"query": "SELECT latest(` clocks_max_memory_mhz`) as 'MAX Memory',latest( ` clocks_current_memory_mhz`) AS 'Current Memory' FROM nvmlSample " | ||
} | ||
], | ||
"platformOptions": { | ||
"ignoreTimeRange": false | ||
} | ||
} | ||
}, | ||
{ | ||
"title": "", | ||
"layout": { | ||
"column": 1, | ||
"row": 4, | ||
"width": 2, | ||
"height": 3 | ||
}, | ||
"linkedEntityGuids": null, | ||
"visualization": { | ||
"id": "viz.markdown" | ||
}, | ||
"rawConfiguration": { | ||
"text": "**About**\n\nInstrument your application with New Relic - [Add Data](https://one.newrelic.com/).\n\nInstrument NVML with New Relic using the [documentation](https://docs.newrelic.com/docs/infrastructure/host-integrations/host-integrations-list/nvml-integration/).\n\n[Please rate this dashboard](https://docs.google.com/forms/d/e/1FAIpQLSclR38J8WbbB2J1tHnllKUkzWZkJhf4SrJGyavpMd4t82NjnQ/viewform?usp=pp_url&entry.1615922415=NVML) here and let us know how we can improve it for you." | ||
} | ||
}, | ||
{ | ||
"title": "Power usage (watts)", | ||
"layout": { | ||
"column": 3, | ||
"row": 4, | ||
"width": 5, | ||
"height": 3 | ||
}, | ||
"linkedEntityGuids": null, | ||
"visualization": { | ||
"id": "viz.line" | ||
}, | ||
"rawConfiguration": { | ||
"facet": { | ||
"showOtherSeries": false | ||
}, | ||
"legend": { | ||
"enabled": true | ||
}, | ||
"nrqlQueries": [ | ||
{ | ||
"accountIds": [], | ||
"query": "SELECT latest( ` power_limit_w`), latest( ` power_draw_w`) FROM nvmlSample TIMESERIES " | ||
} | ||
], | ||
"platformOptions": { | ||
"ignoreTimeRange": false | ||
}, | ||
"thresholds": { | ||
"isLabelVisible": true | ||
}, | ||
"yAxisLeft": { | ||
"zero": true | ||
}, | ||
"yAxisRight": { | ||
"zero": true | ||
} | ||
} | ||
}, | ||
{ | ||
"title": "Memory (MiB)", | ||
"layout": { | ||
"column": 8, | ||
"row": 4, | ||
"width": 3, | ||
"height": 3 | ||
}, | ||
"linkedEntityGuids": null, | ||
"visualization": { | ||
"id": "viz.billboard" | ||
}, | ||
"rawConfiguration": { | ||
"facet": { | ||
"showOtherSeries": false | ||
}, | ||
"nrqlQueries": [ | ||
{ | ||
"accountIds": [], | ||
"query": "SELECT latest(` memory_free_mib`) AS 'Free memory', latest(` memory_used_mib`) AS 'Used memory' FROM nvmlSample " | ||
} | ||
], | ||
"platformOptions": { | ||
"ignoreTimeRange": false | ||
} | ||
} | ||
}, | ||
{ | ||
"title": "Performance state", | ||
"layout": { | ||
"column": 11, | ||
"row": 4, | ||
"width": 2, | ||
"height": 3 | ||
}, | ||
"linkedEntityGuids": null, | ||
"visualization": { | ||
"id": "viz.billboard" | ||
}, | ||
"rawConfiguration": { | ||
"facet": { | ||
"showOtherSeries": false | ||
}, | ||
"nrqlQueries": [ | ||
{ | ||
"accountIds": [], | ||
"query": "SELECT latest(` pstate`) FROM nvmlSample" | ||
} | ||
], | ||
"platformOptions": { | ||
"ignoreTimeRange": false | ||
} | ||
} | ||
}, | ||
{ | ||
"title": "Memory utilization", | ||
"layout": { | ||
"column": 1, | ||
"row": 7, | ||
"width": 3, | ||
"height": 3 | ||
}, | ||
"linkedEntityGuids": null, | ||
"visualization": { | ||
"id": "viz.billboard" | ||
}, | ||
"rawConfiguration": { | ||
"facet": { | ||
"showOtherSeries": false | ||
}, | ||
"nrqlQueries": [ | ||
{ | ||
"accountIds": [], | ||
"query": "SELECT latest(` utilization_memory`) FROM nvmlSample" | ||
} | ||
], | ||
"platformOptions": { | ||
"ignoreTimeRange": false | ||
} | ||
} | ||
}, | ||
{ | ||
"title": "Clock application graphics (MHz)", | ||
"layout": { | ||
"column": 4, | ||
"row": 7, | ||
"width": 4, | ||
"height": 3 | ||
}, | ||
"linkedEntityGuids": null, | ||
"visualization": { | ||
"id": "viz.line" | ||
}, | ||
"rawConfiguration": { | ||
"facet": { | ||
"showOtherSeries": false | ||
}, | ||
"legend": { | ||
"enabled": true | ||
}, | ||
"nrqlQueries": [ | ||
{ | ||
"accountIds": [], | ||
"query": "SELECT latest(` clocks_applications_graphics_mhz`) FROM nvmlSample TIMESERIES " | ||
} | ||
], | ||
"platformOptions": { | ||
"ignoreTimeRange": false | ||
}, | ||
"thresholds": { | ||
"isLabelVisible": true | ||
}, | ||
"yAxisLeft": { | ||
"zero": true | ||
}, | ||
"yAxisRight": { | ||
"zero": true | ||
} | ||
} | ||
}, | ||
{ | ||
"title": "GPU utilization", | ||
"layout": { | ||
"column": 8, | ||
"row": 7, | ||
"width": 3, | ||
"height": 3 | ||
}, | ||
"linkedEntityGuids": null, | ||
"visualization": { | ||
"id": "viz.billboard" | ||
}, | ||
"rawConfiguration": { | ||
"facet": { | ||
"showOtherSeries": false | ||
}, | ||
"nrqlQueries": [ | ||
{ | ||
"accountIds": [], | ||
"query": "SELECT latest(` utilization_gpu`) FROM nvmlSample " | ||
} | ||
], | ||
"platformOptions": { | ||
"ignoreTimeRange": false | ||
} | ||
} | ||
}, | ||
{ | ||
"title": "Clocks throttle reasons active", | ||
"layout": { | ||
"column": 11, | ||
"row": 7, | ||
"width": 2, | ||
"height": 3 | ||
}, | ||
"linkedEntityGuids": null, | ||
"visualization": { | ||
"id": "viz.billboard" | ||
}, | ||
"rawConfiguration": { | ||
"facet": { | ||
"showOtherSeries": false | ||
}, | ||
"nrqlQueries": [ | ||
{ | ||
"accountIds": [], | ||
"query": "SELECT latest(` clocks_throttle_reasons_active`) FROM nvmlSample" | ||
} | ||
], | ||
"platformOptions": { | ||
"ignoreTimeRange": false | ||
} | ||
} | ||
} | ||
] | ||
} | ||
], | ||
"variables": [] | ||
} |
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Oops, something went wrong.