Skip to content

Commit

Permalink
Merge pull request #2412 from newrelic/release
Browse files Browse the repository at this point in the history
Release 5/16/24
  • Loading branch information
sarahkitten authored May 16, 2024
2 parents f4dd9c0 + 79f18d4 commit f32469c
Show file tree
Hide file tree
Showing 20 changed files with 454 additions and 21 deletions.
37 changes: 37 additions & 0 deletions alert-policies/nvml/HighTemperature.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
name: High GPU Temperature

description: |+
This alert is triggered when the GPU temperature exceeds 85 degrees Celsius (warning) or 90 degrees Celsius (critical) for 5 minutes.
type: STATIC
nrql:
query: "SELECT latest(DCGM_FI_DEV_GPU_TEMP) AS 'gpu temperature' FROM Metric WHERE metricName LIKE 'DCGM_FI_DEV_GPU_TEMP'"

# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# List of Critical and Warning thresholds for the condition
terms:
- priority: CRITICAL
# Operator used to compare against the threshold.
operator: ABOVE
# Value that triggers a violation
threshold: 90
# Time in seconds; 120 - 3600
thresholdDuration: 300
# How many data points must be in violation for the duration
thresholdOccurrences: ALL
- priority: WARNING
# Operator used to compare against the threshold.
operator: ABOVE
# Value that triggers a violation
threshold: 85
# Time in seconds; 120 - 3600
thresholdDuration: 300
# How many data points must be in violation for the duration
thresholdOccurrences: ALL


# Duration after which a violation automatically closes
# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 86400
354 changes: 354 additions & 0 deletions dashboards/nvml/nvml.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,354 @@
{
"name": "NVML",
"description": null,
"pages": [
{
"name": "NVML",
"description": null,
"widgets": [
{
"title": "",
"layout": {
"column": 1,
"row": 1,
"width": 3,
"height": 3
},
"linkedEntityGuids": null,
"visualization": {
"id": "viz.markdown"
},
"rawConfiguration": {
"text": "![NVML icon](https://raw.githubusercontent.com/newrelic/newrelic-quickstarts/main/quickstarts/nvidia-dcgm/logo.png)"
}
},
{
"title": "Device count",
"layout": {
"column": 4,
"row": 1,
"width": 3,
"height": 3
},
"linkedEntityGuids": null,
"visualization": {
"id": "viz.billboard"
},
"rawConfiguration": {
"facet": {
"showOtherSeries": false
},
"nrqlQueries": [
{
"accountIds": [],
"query": "SELECT latest(device_count) FROM nvmlSample"
}
],
"platformOptions": {
"ignoreTimeRange": false
}
}
},
{
"title": "GPU Temperature",
"layout": {
"column": 7,
"row": 1,
"width": 4,
"height": 3
},
"linkedEntityGuids": null,
"visualization": {
"id": "viz.line"
},
"rawConfiguration": {
"facet": {
"showOtherSeries": false
},
"legend": {
"enabled": true
},
"nrqlQueries": [
{
"accountIds": [],
"query": "SELECT latest(temperature_gpu) FROM nvmlSample TIMESERIES "
}
],
"platformOptions": {
"ignoreTimeRange": false
},
"thresholds": {
"isLabelVisible": true
},
"units": {
"unit": "CELSIUS"
},
"yAxisLeft": {
"zero": true
},
"yAxisRight": {
"zero": true
}
}
},
{
"title": "Clock memory (MHz)",
"layout": {
"column": 11,
"row": 1,
"width": 2,
"height": 3
},
"linkedEntityGuids": null,
"visualization": {
"id": "viz.billboard"
},
"rawConfiguration": {
"facet": {
"showOtherSeries": false
},
"nrqlQueries": [
{
"accountIds": [],
"query": "SELECT latest(` clocks_max_memory_mhz`) as 'MAX Memory',latest( ` clocks_current_memory_mhz`) AS 'Current Memory' FROM nvmlSample "
}
],
"platformOptions": {
"ignoreTimeRange": false
}
}
},
{
"title": "",
"layout": {
"column": 1,
"row": 4,
"width": 2,
"height": 3
},
"linkedEntityGuids": null,
"visualization": {
"id": "viz.markdown"
},
"rawConfiguration": {
"text": "**About**\n\nInstrument your application with New Relic - [Add Data](https://one.newrelic.com/).\n\nInstrument NVML with New Relic using the [documentation](https://docs.newrelic.com/docs/infrastructure/host-integrations/host-integrations-list/nvml-integration/).\n\n[Please rate this dashboard](https://docs.google.com/forms/d/e/1FAIpQLSclR38J8WbbB2J1tHnllKUkzWZkJhf4SrJGyavpMd4t82NjnQ/viewform?usp=pp_url&entry.1615922415=NVML) here and let us know how we can improve it for you."
}
},
{
"title": "Power usage (watts)",
"layout": {
"column": 3,
"row": 4,
"width": 5,
"height": 3
},
"linkedEntityGuids": null,
"visualization": {
"id": "viz.line"
},
"rawConfiguration": {
"facet": {
"showOtherSeries": false
},
"legend": {
"enabled": true
},
"nrqlQueries": [
{
"accountIds": [],
"query": "SELECT latest( ` power_limit_w`), latest( ` power_draw_w`) FROM nvmlSample TIMESERIES "
}
],
"platformOptions": {
"ignoreTimeRange": false
},
"thresholds": {
"isLabelVisible": true
},
"yAxisLeft": {
"zero": true
},
"yAxisRight": {
"zero": true
}
}
},
{
"title": "Memory (MiB)",
"layout": {
"column": 8,
"row": 4,
"width": 3,
"height": 3
},
"linkedEntityGuids": null,
"visualization": {
"id": "viz.billboard"
},
"rawConfiguration": {
"facet": {
"showOtherSeries": false
},
"nrqlQueries": [
{
"accountIds": [],
"query": "SELECT latest(` memory_free_mib`) AS 'Free memory', latest(` memory_used_mib`) AS 'Used memory' FROM nvmlSample "
}
],
"platformOptions": {
"ignoreTimeRange": false
}
}
},
{
"title": "Performance state",
"layout": {
"column": 11,
"row": 4,
"width": 2,
"height": 3
},
"linkedEntityGuids": null,
"visualization": {
"id": "viz.billboard"
},
"rawConfiguration": {
"facet": {
"showOtherSeries": false
},
"nrqlQueries": [
{
"accountIds": [],
"query": "SELECT latest(` pstate`) FROM nvmlSample"
}
],
"platformOptions": {
"ignoreTimeRange": false
}
}
},
{
"title": "Memory utilization",
"layout": {
"column": 1,
"row": 7,
"width": 3,
"height": 3
},
"linkedEntityGuids": null,
"visualization": {
"id": "viz.billboard"
},
"rawConfiguration": {
"facet": {
"showOtherSeries": false
},
"nrqlQueries": [
{
"accountIds": [],
"query": "SELECT latest(` utilization_memory`) FROM nvmlSample"
}
],
"platformOptions": {
"ignoreTimeRange": false
}
}
},
{
"title": "Clock application graphics (MHz)",
"layout": {
"column": 4,
"row": 7,
"width": 4,
"height": 3
},
"linkedEntityGuids": null,
"visualization": {
"id": "viz.line"
},
"rawConfiguration": {
"facet": {
"showOtherSeries": false
},
"legend": {
"enabled": true
},
"nrqlQueries": [
{
"accountIds": [],
"query": "SELECT latest(` clocks_applications_graphics_mhz`) FROM nvmlSample TIMESERIES "
}
],
"platformOptions": {
"ignoreTimeRange": false
},
"thresholds": {
"isLabelVisible": true
},
"yAxisLeft": {
"zero": true
},
"yAxisRight": {
"zero": true
}
}
},
{
"title": "GPU utilization",
"layout": {
"column": 8,
"row": 7,
"width": 3,
"height": 3
},
"linkedEntityGuids": null,
"visualization": {
"id": "viz.billboard"
},
"rawConfiguration": {
"facet": {
"showOtherSeries": false
},
"nrqlQueries": [
{
"accountIds": [],
"query": "SELECT latest(` utilization_gpu`) FROM nvmlSample "
}
],
"platformOptions": {
"ignoreTimeRange": false
}
}
},
{
"title": "Clocks throttle reasons active",
"layout": {
"column": 11,
"row": 7,
"width": 2,
"height": 3
},
"linkedEntityGuids": null,
"visualization": {
"id": "viz.billboard"
},
"rawConfiguration": {
"facet": {
"showOtherSeries": false
},
"nrqlQueries": [
{
"accountIds": [],
"query": "SELECT latest(` clocks_throttle_reasons_active`) FROM nvmlSample"
}
],
"platformOptions": {
"ignoreTimeRange": false
}
}
}
]
}
],
"variables": []
}
Binary file added dashboards/nvml/nvml01.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading

0 comments on commit f32469c

Please sign in to comment.