Skip to content

Commit 0b5fffc

Browse files
Merge pull request #275 from KernelTuner/tegra_observer
Tegra observer with continuous observer
2 parents f307f50 + 1e9a55b commit 0b5fffc

File tree

5 files changed

+447
-76
lines changed

5 files changed

+447
-76
lines changed
+46
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
#!/usr/bin/env python
2+
"""This is the minimal example from the README"""
3+
4+
import json
5+
6+
import numpy
7+
from kernel_tuner import tune_kernel
8+
from kernel_tuner.observers.tegra import TegraObserver
9+
10+
def tune():
    """Tune the vector_add kernel while observing the GPU core frequency.

    Returns the list of benchmark results produced by tune_kernel.
    """
    kernel_string = """
    __global__ void vector_add(float *c, float *a, float *b, int n) {
        int i = blockIdx.x * block_size_x + threadIdx.x;
        if (i<n) {
            c[i] = a[i] + b[i];
        }
    }
    """

    # problem size and host-side input/output data
    size = 800000
    a = numpy.random.randn(size).astype(numpy.float32)
    b = numpy.random.randn(size).astype(numpy.float32)
    c = numpy.zeros_like(b)
    n = numpy.int32(size)
    args = [c, a, b, n]

    # tunable thread-block sizes: 128, 192, ..., 1024
    tune_params = {"block_size_x": [128 + 64 * i for i in range(15)]}

    # observer that records the GPU core frequency during benchmarking
    observer = TegraObserver(["core_freq"])

    # derived metric: expose the observed core frequency as "f"
    metrics = {"f": lambda p: p["core_freq"]}

    results, env = tune_kernel(
        "vector_add",
        kernel_string,
        size,
        args,
        tune_params,
        observers=[observer],
        metrics=metrics,
    )

    print(results)

    return results


if __name__ == "__main__":
    tune()

kernel_tuner/core.py

+13-3
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,8 @@
2020
from kernel_tuner.backends.nvcuda import CudaFunctions
2121
from kernel_tuner.backends.opencl import OpenCLFunctions
2222
from kernel_tuner.backends.compiler import CompilerFunctions
23-
from kernel_tuner.backends.opencl import OpenCLFunctions
24-
from kernel_tuner.backends.hip import HipFunctions
2523
from kernel_tuner.observers.nvml import NVMLObserver
24+
from kernel_tuner.observers.tegra import TegraObserver
2625
from kernel_tuner.observers.observer import ContinuousObserver, OutputObserver, PrologueObserver
2726

2827
try:
@@ -316,8 +315,9 @@ def __init__(
316315
raise ValueError("Sorry, support for languages other than CUDA, OpenCL, HIP, C, and Fortran is not implemented yet")
317316
self.dev = dev
318317

319-
# look for NVMLObserver in observers, if present, enable special tunable parameters through nvml
318+
# look for NVMLObserver and TegraObserver in observers, if present, enable special tunable parameters through nvml/tegra
320319
self.use_nvml = False
320+
self.use_tegra = False
321321
self.continuous_observers = []
322322
self.output_observers = []
323323
self.prologue_observers = []
@@ -326,6 +326,9 @@ def __init__(
326326
if isinstance(obs, NVMLObserver):
327327
self.nvml = obs.nvml
328328
self.use_nvml = True
329+
if isinstance(obs, TegraObserver):
330+
self.tegra = obs.tegra
331+
self.use_tegra = True
329332
if hasattr(obs, "continuous_observer"):
330333
self.continuous_observers.append(obs.continuous_observer)
331334
if isinstance(obs, OutputObserver):
@@ -382,6 +385,7 @@ def benchmark_default(self, func, gpu_args, threads, grid, result):
382385
for obs in self.benchmark_observers:
383386
result.update(obs.get_results())
384387

388+
385389
def benchmark_continuous(self, func, gpu_args, threads, grid, result, duration):
386390
"""Benchmark continuously for at least 'duration' seconds"""
387391
iterations = int(np.ceil(duration / (result["time"] / 1000)))
@@ -405,6 +409,7 @@ def benchmark_continuous(self, func, gpu_args, threads, grid, result, duration):
405409
for obs in self.continuous_observers:
406410
result.update(obs.get_results())
407411

412+
408413
def set_nvml_parameters(self, instance):
409414
"""Set the NVML parameters. Avoids setting time leaking into benchmark time."""
410415
if self.use_nvml:
@@ -419,6 +424,11 @@ def set_nvml_parameters(self, instance):
419424
if "nvml_mem_clock" in instance.params:
420425
self.nvml.mem_clock = instance.params["nvml_mem_clock"]
421426

427+
if self.use_tegra:
428+
if "tegra_gr_clock" in instance.params:
429+
self.tegra.gr_clock = instance.params["tegra_gr_clock"]
430+
431+
422432
def benchmark(self, func, gpu_args, instance, verbose, objective, skip_nvml_setting=False):
423433
"""Benchmark the kernel instance."""
424434
logging.debug("benchmark " + instance.name)

kernel_tuner/observers/nvml.py

+6-71
Original file line numberDiff line numberDiff line change
@@ -323,7 +323,7 @@ def __init__(
323323
save_all=False,
324324
nvidia_smi_fallback=None,
325325
use_locked_clocks=False,
326-
continous_duration=1,
326+
continuous_duration=1,
327327
):
328328
"""Create an NVMLObserver."""
329329
if nvidia_smi_fallback:
@@ -355,7 +355,7 @@ def __init__(
355355
if any([obs in self.needs_power for obs in observables]):
356356
self.measure_power = True
357357
power_observables = [obs for obs in observables if obs in self.needs_power]
358-
self.continuous_observer = NVMLPowerObserver(power_observables, self, self.nvml, continous_duration)
358+
self.continuous_observer = ContinuousObserver("nvml", power_observables, self, continuous_duration=continuous_duration)
359359

360360
# remove power observables
361361
self.observables = [obs for obs in observables if obs not in self.needs_power]
@@ -373,6 +373,10 @@ def __init__(
373373
self.during_obs = [obs for obs in observables if obs in ["core_freq", "mem_freq", "temperature"]]
374374
self.iteration = {obs: [] for obs in self.during_obs}
375375

376+
def read_power(self):
    """Return the current GPU power draw in Watts (NVML reports milliwatts)."""
    return self.nvml.pwr_usage() / 1000.0
379+
376380
def before_start(self):
377381
# clear results of the observables for next measurement
378382
self.iteration = {obs: [] for obs in self.during_obs}
@@ -428,75 +432,6 @@ def get_results(self):
428432
return averaged_results
429433

430434

431-
class NVMLPowerObserver(ContinuousObserver):
    """Observer that measures power using NVML and continuous benchmarking."""

    # NOTE: parameter name "continous_duration" (sic) is kept as-is; it is
    # part of the public interface callers may pass by keyword.
    def __init__(self, observables, parent, nvml_instance, continous_duration=1):
        self.parent = parent
        self.nvml = nvml_instance

        supported = ["power_readings", "nvml_power", "nvml_energy"]
        for obs in observables:
            if obs not in supported:
                raise ValueError(f"Observable {obs} not in supported: {supported}")
        self.observables = observables

        # duration in seconds
        self.continuous_duration = continous_duration

        self.power = 0
        self.energy = 0
        self.power_readings = []
        self.t0 = 0

        # results from the last iteration-based benchmark
        self.results = None

    def before_start(self):
        """Reset accumulated power state before a new measurement starts."""
        self.parent.before_start()
        self.power = 0
        self.energy = 0
        self.power_readings = []

    def after_start(self):
        """Record the start timestamp once the kernel has been launched."""
        self.parent.after_start()
        self.t0 = time.perf_counter()

    def during(self):
        """Sample NVML power; store only readings that differ from the last one."""
        self.parent.during()
        reading = self.nvml.pwr_usage()
        elapsed = time.perf_counter() - self.t0
        if self.power_readings:
            last_time, last_reading = self.power_readings[-1]
            # keep the sample when the value changed or >10 ms passed
            fresh = reading != last_reading or elapsed - last_time > 0.01
        else:
            fresh = True
        if fresh:
            self.power_readings.append([elapsed, reading])

    def after_finish(self):
        """Derive median power (W) and energy (J) from the collected samples."""
        self.parent.after_finish()
        # safeguard in case we have no measurements, perhaps the kernel was too short to measure anything
        if not self.power_readings:
            return

        # convert to seconds from milliseconds
        execution_time = self.results["time"] / 1e3
        # NVML reports milliwatts; divide by 1e3 to get Watts
        self.power = np.median([sample[1] / 1e3 for sample in self.power_readings])
        self.energy = self.power * execution_time

    def get_results(self):
        """Return parent results prefixed with 'pwr_' plus the requested observables."""
        results = {"pwr_" + key: value for key, value in self.parent.get_results().items()}
        if "nvml_energy" in self.observables:
            results["nvml_energy"] = self.energy
        if "nvml_power" in self.observables:
            results["nvml_power"] = self.power
        if "power_readings" in self.observables:
            results["power_readings"] = self.power_readings
        return results
498-
499-
500435
# High-level Helper functions
501436

502437

kernel_tuner/observers/observer.py

+73-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from abc import ABC, abstractmethod
2-
2+
import time
3+
import numpy as np
34

45
class BenchmarkObserver(ABC):
56
"""Base class for Benchmark Observers"""
@@ -44,8 +45,78 @@ class IterationObserver(BenchmarkObserver):
4445

4546

4647
class ContinuousObserver(BenchmarkObserver):
    """Generic observer that measures power during continuous benchmarking.

    To support continuous benchmarking, the parent observer must provide
    a ``read_power()`` method, which the ContinuousObserver calls to read
    the current power draw in Watts.
    """

    def __init__(self, name, observables, parent, continuous_duration=1):
        """Create a ContinuousObserver.

        :param name: prefix used for the supported observable names
            (e.g. "nvml" yields "nvml_power" and "nvml_energy").
        :param observables: list of observables to record; must be a subset
            of [name + "_power", name + "_energy", "power_readings"].
        :param parent: the observer whose read_power() supplies measurements.
        :param continuous_duration: measurement duration in seconds.
        :raises ValueError: if an observable is not supported.
        """
        self.parent = parent
        self.name = name

        supported = [self.name + "_power", self.name + "_energy", "power_readings"]
        for obs in observables:
            if obs not in supported:
                raise ValueError(f"Observable {obs} not in supported: {supported}")
        self.observables = observables

        # duration in seconds
        self.continuous_duration = continuous_duration

        self.power = 0
        self.energy = 0
        self.power_readings = []
        self.t0 = 0

        # results from the last iteration-based benchmark
        # these are set by the benchmarking function of Kernel Tuner before
        # the continuous observer is called.
        self.results = None

    def before_start(self):
        """Reset accumulated power state before a new measurement starts."""
        self.parent.before_start()
        self.power = 0
        self.energy = 0
        self.power_readings = []

    def after_start(self):
        """Record the start timestamp once the kernel has been launched."""
        self.parent.after_start()
        self.t0 = time.perf_counter()

    def during(self):
        """Sample power via the parent; store only new (timestamp, power) pairs."""
        self.parent.during()
        power_usage = self.parent.read_power()
        timestamp = time.perf_counter() - self.t0
        # only store the result if we get a new measurement from the GPU
        # (value changed, or more than 10 ms since the previous sample)
        if len(self.power_readings) == 0 or (
            self.power_readings[-1][1] != power_usage
            or timestamp - self.power_readings[-1][0] > 0.01
        ):
            self.power_readings.append([timestamp, power_usage])

    def after_finish(self):
        """Derive median power (W) and energy (J) from the collected samples."""
        self.parent.after_finish()
        # safeguard in case we have no measurements, perhaps the kernel was too short to measure anything
        if not self.power_readings:
            return

        # convert to seconds from milliseconds
        execution_time = self.results["time"] / 1e3
        self.power = np.median([d[1] for d in self.power_readings])
        self.energy = self.power * execution_time

    def get_results(self):
        """Return parent results prefixed with 'pwr_' plus the requested observables."""
        results = self.parent.get_results()
        keys = list(results.keys())
        for key in keys:
            results["pwr_" + key] = results.pop(key)
        if self.name + "_power" in self.observables:
            results[self.name + "_power"] = self.power
        if self.name + "_energy" in self.observables:
            results[self.name + "_energy"] = self.energy
        if "power_readings" in self.observables:
            results["power_readings"] = self.power_readings
        return results
49120

50121
class OutputObserver(BenchmarkObserver):
51122
"""Observer that can verify or measure something about the output produced by a kernel."""

0 commit comments

Comments
 (0)