Skip to content

Commit

Permalink
Added GPU metrics retrieval
Browse files Browse the repository at this point in the history
Signed-off-by: pierantoniomerlino <[email protected]>
  • Loading branch information
pierantoniomerlino committed Feb 14, 2025
1 parent 455b2a1 commit 8d0d496
Show file tree
Hide file tree
Showing 6 changed files with 236 additions and 92 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ Import-Package: com.google.common.base;version="25.0.0",
com.google.common.io;version="25.0.0",
com.google.common.util.concurrent;version="25.0.0",
com.google.gson;version="2.7.0",
com.google.gson.annotations;version="2.9.0",
com.google.protobuf;version="3.19.3",
org.apache.commons.io;version="2.4.0",
org.apache.commons.io.filefilter;version="2.11.0",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
/*******************************************************************************
* Copyright (c) 2025 Eurotech and/or its affiliates and others
*
* This program and the accompanying materials are made
* available under the terms of the Eclipse Public License 2.0
* which is available at https://www.eclipse.org/legal/epl-2.0/
*
* SPDX-License-Identifier: EPL-2.0
*
* Contributors:
* Eurotech
******************************************************************************/
package org.eclipse.kura.ai.triton.server;

import java.util.HashMap;
import java.util.Map;

import com.google.gson.annotations.SerializedName;

/**
 * Holder for the metrics reported for a single GPU.
 * <p>
 * The two fields carry Gson {@code @SerializedName} annotations, so a serialized instance
 * uses the keys {@code gpu_uuid} and {@code gpu_stats}.
 */
public class GpuMetrics {

    /** Identifier of the GPU the metrics belong to. */
    @SerializedName("gpu_uuid")
    private final String uuid;

    /** Metric values keyed by metric name. */
    @SerializedName("gpu_stats")
    private final Map<String, String> metrics = new HashMap<>();

    /**
     * Creates a holder with no metrics for the given GPU.
     *
     * @param gpuUuid
     *            the identifier of the GPU
     */
    public GpuMetrics(final String gpuUuid) {
        this.uuid = gpuUuid;
    }

    /**
     * @return the identifier passed to the constructor
     */
    public String getGpuUuid() {
        return this.uuid;
    }

    /**
     * @return the internal (live, mutable) map of metric names to values
     */
    public Map<String, String> getGpuMetrics() {
        return this.metrics;
    }

    /**
     * Stores a metric, replacing any value previously stored under the same name.
     *
     * @param metricName
     *            the metric name
     * @param metricValue
     *            the metric value
     */
    public void addGpuMetric(final String metricName, final String metricValue) {
        this.metrics.put(metricName, metricValue);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
/*******************************************************************************
* Copyright (c) 2025 Eurotech and/or its affiliates and others
*
* This program and the accompanying materials are made
* available under the terms of the Eclipse Public License 2.0
* which is available at https://www.eclipse.org/legal/epl-2.0/
*
* SPDX-License-Identifier: EPL-2.0
*
* Contributors:
* Eurotech
******************************************************************************/
package org.eclipse.kura.ai.triton.server;

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;

import com.google.gson.Gson;
import com.google.gson.GsonBuilder;

/**
 * Extracts the per-GPU metrics from the raw metric lines returned by a Triton Server and
 * renders them as one JSON document per GPU.
 */
public class GpuMetricsParser {

    private static final String COMMENT_PREFIX = "#";
    private static final String GPU_METRIC_MARKER = "_gpu_";
    private static final String ENERGY_CONSUMPTION_METRIC = "nv_energy_consumption";
    private static final String GPU_UUID_LABEL = "gpu_uuid=\"";

    private final List<String> rawMetrics;
    private final Gson gson = new GsonBuilder().create();

    /**
     * @param metrics
     *            the raw metric lines, one sample or comment per element; a {@code null}
     *            list is treated as empty
     */
    public GpuMetricsParser(List<String> metrics) {
        this.rawMetrics = metrics;
    }

    /**
     * Parse the gpu metrics provided by a Triton Server. An example of metrics are the following:
     *
     * <pre>
     * # HELP nv_gpu_utilization GPU utilization rate [0.0 - 1.0)
     * # TYPE nv_gpu_utilization gauge
     * nv_gpu_utilization{gpu_uuid="GPU-340cec52-80ba-c0df-8511-5f9680aae0ed"} 0.000000
     * # HELP nv_gpu_memory_total_bytes GPU total memory, in bytes
     * # TYPE nv_gpu_memory_total_bytes gauge
     * nv_gpu_memory_total_bytes{gpu_uuid="GPU-340cec52-80ba-c0df-8511-5f9680aae0ed"} 16101933056.000000
     * # HELP nv_gpu_memory_used_bytes GPU used memory, in bytes
     * # TYPE nv_gpu_memory_used_bytes gauge
     * nv_gpu_memory_used_bytes{gpu_uuid="GPU-340cec52-80ba-c0df-8511-5f9680aae0ed"} 617611264.000000
     * </pre>
     *
     * The lines beginning with a # are filtered, as are the lines that are neither a
     * {@code *_gpu_*} metric nor a {@code nv_energy_consumption} sample and the lines without
     * a {@code gpu_uuid} label. The key of each returned entry is the value of the
     * 'gpu_uuid' label. The entry value collects all the metrics of that GPU in json format.
     * For example:
     *
     * <pre>
     * {
     *   "gpu_uuid" : "GPU-340cec52-80ba-c0df-8511-5f9680aae0ed",
     *   "gpu_stats" : {
     *     "nv_gpu_utilization": "0.000000"
     *   }
     * }
     * </pre>
     *
     * @return a map from GPU uuid to the json representation of its metrics; empty when no
     *         GPU metric is found
     */
    public Map<String, String> parse() {
        Map<String, String> metrics = new HashMap<>();
        if (this.rawMetrics == null) {
            return metrics;
        }

        // Rebuilt on every call so that the result only reflects the current raw lines.
        Map<String, GpuMetrics> metricsByGpu = new HashMap<>();
        for (String line : this.rawMetrics) {
            if (!isGpuMetricLine(line)) {
                continue;
            }
            Optional<String> uuid = parseUuid(line);
            Optional<String> name = parseName(line);
            if (!uuid.isPresent() || !name.isPresent()) {
                continue;
            }
            metricsByGpu.computeIfAbsent(uuid.get(), GpuMetrics::new).addGpuMetric(name.get(), parseValue(line));
        }

        metricsByGpu.forEach((uuid, gpuMetrics) -> metrics.put(uuid, this.gson.toJson(gpuMetrics)));
        return metrics;
    }

    /** Tells whether the line is a sample (not a comment) of a GPU related metric. */
    private boolean isGpuMetricLine(String line) {
        if (line == null || line.startsWith(COMMENT_PREFIX)) {
            return false;
        }
        // A sample line is "<name>{<labels>} <value>", so the energy metric has to be
        // matched by prefix: the line never equals the bare metric name.
        return line.contains(GPU_METRIC_MARKER) || line.startsWith(ENERGY_CONSUMPTION_METRIC);
    }

    /** Returns the metric name, i.e. the text preceding the label set, if it is not blank. */
    private Optional<String> parseName(String line) {
        int labelsStart = line.indexOf('{');
        String name = (labelsStart < 0 ? line : line.substring(0, labelsStart)).trim();
        return name.isEmpty() ? Optional.empty() : Optional.of(name);
    }

    /** Returns the value of the 'gpu_uuid' label, wherever it appears in the label set. */
    private Optional<String> parseUuid(String line) {
        int labelStart = line.indexOf(GPU_UUID_LABEL);
        if (labelStart < 0) {
            return Optional.empty();
        }
        int valueStart = labelStart + GPU_UUID_LABEL.length();
        // Stop at the closing quote so that labels following gpu_uuid are not swallowed.
        int valueEnd = line.indexOf('"', valueStart);
        if (valueEnd <= valueStart) {
            return Optional.empty();
        }
        return Optional.of(line.substring(valueStart, valueEnd));
    }

    /** Returns the sample value, or an empty string when the line carries none. */
    private String parseValue(String line) {
        int labelsEnd = line.lastIndexOf('}');
        // With a label set the value is the first token after it; without one the line is
        // "<name> <value>" and the value is the second token.
        String[] tokens = line.substring(labelsEnd + 1).trim().split("\\s+");
        int valueIndex = labelsEnd >= 0 ? 0 : 1;
        return tokens.length > valueIndex ? tokens[valueIndex] : "";
    }

}

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -400,9 +400,22 @@ public List<Tensor> infer(ModelInfo modelInfo, List<Tensor> inputData) throws Ku

/**
 * Collects the model statistics and the GPU metrics into a single map.
 * <p>
 * The model statistics are gathered first; a GPU metric stored under an already present
 * key replaces the previous value.
 *
 * @return a new map holding both sets of entries
 * @throws KuraException
 *             if the retrieval of the GPU metrics fails
 */
@Override
public Map<String, String> getMetrics() throws KuraException {
    final Map<String, String> collected = new HashMap<>(getModelStatistics());
    collected.putAll(getGpuMetrics());
    return collected;
}

/**
 * Returns the model statistics. No statistic is collected here: the result is always a
 * new, empty, mutable map.
 *
 * @return an empty map
 */
private Map<String, String> getModelStatistics() {
    return new HashMap<>();
}

/**
 * Retrieves the raw metric lines through {@code getListMetrics()} and converts them with
 * a {@link GpuMetricsParser}.
 *
 * @return the map produced by {@link GpuMetricsParser#parse()}
 * @throws KuraException
 *             if the raw metrics cannot be retrieved
 */
private Map<String, String> getGpuMetrics() throws KuraException {
    List<String> response = getListMetrics();

    // The former MetricsParser was removed by this change: only GpuMetricsParser is used.
    GpuMetricsParser metricsParser = new GpuMetricsParser(response);
    return metricsParser.parse();
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
package org.eclipse.kura.ai.triton.server;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

import java.util.Arrays;
import java.util.List;
import java.util.Map;

import org.junit.Test;

/**
 * Unit tests for {@link GpuMetricsParser}, written in given/when/then style.
 */
public class MetricsParserTest {

    private GpuMetricsParser metricsParser;
    private Map<String, String> metricsMap;

    private static final String GPU_UUID = "GPU-340cec52-80ba-c0df-8511-5f9680aae0ed";

    // Four samples of the same GPU; the trailing HELP/TYPE pair has no sample on purpose.
    private static final String GPU_METRIC = //
            "# HELP nv_gpu_utilization GPU utilization rate [0.0 - 1.0)\n" //
                    + "# TYPE nv_gpu_utilization gauge\n" //
                    + "nv_gpu_utilization{gpu_uuid=\"GPU-340cec52-80ba-c0df-8511-5f9680aae0ed\"} 0.000000\n" //
                    + "# HELP nv_gpu_memory_total_bytes GPU total memory, in bytes\n" //
                    + "# TYPE nv_gpu_memory_total_bytes gauge\n" //
                    + "nv_gpu_memory_total_bytes{gpu_uuid=\"GPU-340cec52-80ba-c0df-8511-5f9680aae0ed\"} 16101933056.000000\n" //
                    + "# HELP nv_gpu_memory_used_bytes GPU used memory, in bytes\n" //
                    + "# TYPE nv_gpu_memory_used_bytes gauge\n" //
                    + "nv_gpu_memory_used_bytes{gpu_uuid=\"GPU-340cec52-80ba-c0df-8511-5f9680aae0ed\"} 617611264.000000\n" //
                    + "# HELP nv_gpu_power_usage GPU power usage in watts\n" //
                    + "# TYPE nv_gpu_power_usage gauge\n" //
                    + "nv_gpu_power_usage{gpu_uuid=\"GPU-340cec52-80ba-c0df-8511-5f9680aae0ed\"} 20.085000\n" //
                    + "# HELP nv_gpu_power_limit GPU power management limit in watts\n" //
                    + "# TYPE nv_gpu_power_limit gauge";
    private static final String SINGLE_GPU_METRIC = //
            "# HELP nv_gpu_utilization GPU utilization rate [0.0 - 1.0)\n" //
                    + "# TYPE nv_gpu_utilization gauge\n" //
                    + "nv_gpu_utilization{gpu_uuid=\"GPU-340cec52-80ba-c0df-8511-5f9680aae0ed\"} 0.000000";
    private static final String EXPECTED_SINGLE_GPU_METRIC = //
            "{\"gpu_uuid\":\"GPU-340cec52-80ba-c0df-8511-5f9680aae0ed\",\"gpu_stats\":{\"nv_gpu_utilization\":\"0.000000\"}}";

    @Test
    public void shouldParseSingleGpuMetric() {
        givenMetricsParser(Arrays.asList(SINGLE_GPU_METRIC.split("\n")));

        whenParse();

        thenSingleMetricIsParsed(GPU_UUID);
        thenSingleMetricIs(GPU_UUID, EXPECTED_SINGLE_GPU_METRIC);
    }

    @Test
    public void shouldGroupAllMetricsOfTheSameGpu() {
        givenMetricsParser(Arrays.asList(GPU_METRIC.split("\n")));

        whenParse();

        thenSingleMetricIsParsed(GPU_UUID);
        thenMetricsCountIs(1);
        // The order of the entries inside "gpu_stats" is not guaranteed, so each one is
        // checked on its own instead of comparing the whole document.
        thenMetricContains(GPU_UUID, "\"gpu_uuid\":\"" + GPU_UUID + "\"");
        thenMetricContains(GPU_UUID, "\"nv_gpu_utilization\":\"0.000000\"");
        thenMetricContains(GPU_UUID, "\"nv_gpu_memory_total_bytes\":\"16101933056.000000\"");
        thenMetricContains(GPU_UUID, "\"nv_gpu_memory_used_bytes\":\"617611264.000000\"");
        thenMetricContains(GPU_UUID, "\"nv_gpu_power_usage\":\"20.085000\"");
    }

    private void givenMetricsParser(List<String> metrics) {
        this.metricsParser = new GpuMetricsParser(metrics);
    }

    private void whenParse() {
        this.metricsMap = this.metricsParser.parse();
    }

    private void thenSingleMetricIsParsed(String expectedMetricName) {
        assertFalse(this.metricsMap.isEmpty());
        assertTrue(this.metricsMap.containsKey(expectedMetricName));
    }

    private void thenSingleMetricIs(String expectedMetricName, String expectedMetric) {
        assertEquals(expectedMetric.trim().replace("\n", ""), this.metricsMap.get(expectedMetricName));
    }

    private void thenMetricsCountIs(int expectedCount) {
        assertEquals(expectedCount, this.metricsMap.size());
    }

    private void thenMetricContains(String expectedMetricName, String expectedFragment) {
        assertTrue(this.metricsMap.get(expectedMetricName).contains(expectedFragment));
    }
}

0 comments on commit 8d0d496

Please sign in to comment.