Skip to content

Commit b207e06

Browse files
committed
Merge branch 'master' into directives
2 parents 760462c + 6ebf773 commit b207e06

File tree

8 files changed

+506
-78
lines changed

8 files changed

+506
-78
lines changed
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
#!/usr/bin/env python
2+
"""This is the minimal example from the README"""
3+
4+
import json
5+
6+
import numpy
7+
from kernel_tuner import tune_kernel
8+
from kernel_tuner.observers.tegra import TegraObserver
9+
10+
def tune():
11+
12+
kernel_string = """
13+
__global__ void vector_add(float *c, float *a, float *b, int n) {
14+
int i = blockIdx.x * block_size_x + threadIdx.x;
15+
if (i<n) {
16+
c[i] = a[i] + b[i];
17+
}
18+
}
19+
"""
20+
21+
size = 800000
22+
23+
a = numpy.random.randn(size).astype(numpy.float32)
24+
b = numpy.random.randn(size).astype(numpy.float32)
25+
c = numpy.zeros_like(b)
26+
n = numpy.int32(size)
27+
28+
args = [c, a, b, n]
29+
30+
tune_params = dict()
31+
tune_params["block_size_x"] = [128+64*i for i in range(15)]
32+
33+
tegraobserver = TegraObserver(["core_freq"])
34+
35+
metrics = dict()
36+
metrics["f"] = lambda p: p["core_freq"]
37+
38+
results, env = tune_kernel("vector_add", kernel_string, size, args, tune_params, observers=[tegraobserver], metrics=metrics)
39+
40+
print(results)
41+
42+
return results
43+
44+
45+
if __name__ == "__main__":
46+
tune()

kernel_tuner/backends/hip.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
"bool": ctypes.c_bool,
2020
"int8": ctypes.c_int8,
2121
"int16": ctypes.c_int16,
22+
"float16": ctypes.c_int16,
2223
"int32": ctypes.c_int32,
2324
"int64": ctypes.c_int64,
2425
"uint8": ctypes.c_uint8,

kernel_tuner/core.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,8 @@
2020
from kernel_tuner.backends.nvcuda import CudaFunctions
2121
from kernel_tuner.backends.opencl import OpenCLFunctions
2222
from kernel_tuner.backends.compiler import CompilerFunctions
23-
from kernel_tuner.backends.opencl import OpenCLFunctions
24-
from kernel_tuner.backends.hip import HipFunctions
2523
from kernel_tuner.observers.nvml import NVMLObserver
24+
from kernel_tuner.observers.tegra import TegraObserver
2625
from kernel_tuner.observers.observer import ContinuousObserver, OutputObserver, PrologueObserver
2726

2827
try:
@@ -307,8 +306,9 @@ def __init__(
307306
)
308307
self.dev = dev
309308

310-
# look for NVMLObserver in observers, if present, enable special tunable parameters through nvml
309+
# look for NVMLObserver and TegraObserver in observers, if present, enable special tunable parameters through nvml/tegra
311310
self.use_nvml = False
311+
self.use_tegra = False
312312
self.continuous_observers = []
313313
self.output_observers = []
314314
self.prologue_observers = []
@@ -317,6 +317,9 @@ def __init__(
317317
if isinstance(obs, NVMLObserver):
318318
self.nvml = obs.nvml
319319
self.use_nvml = True
320+
if isinstance(obs, TegraObserver):
321+
self.tegra = obs.tegra
322+
self.use_tegra = True
320323
if hasattr(obs, "continuous_observer"):
321324
self.continuous_observers.append(obs.continuous_observer)
322325
if isinstance(obs, OutputObserver):
@@ -373,6 +376,7 @@ def benchmark_default(self, func, gpu_args, threads, grid, result):
373376
for obs in self.benchmark_observers:
374377
result.update(obs.get_results())
375378

379+
376380
def benchmark_continuous(self, func, gpu_args, threads, grid, result, duration):
377381
"""Benchmark continuously for at least 'duration' seconds"""
378382
iterations = int(np.ceil(duration / (result["time"] / 1000)))
@@ -396,6 +400,7 @@ def benchmark_continuous(self, func, gpu_args, threads, grid, result, duration):
396400
for obs in self.continuous_observers:
397401
result.update(obs.get_results())
398402

403+
399404
def set_nvml_parameters(self, instance):
400405
"""Set the NVML parameters. Avoids setting time leaking into benchmark time."""
401406
if self.use_nvml:
@@ -410,6 +415,11 @@ def set_nvml_parameters(self, instance):
410415
if "nvml_mem_clock" in instance.params:
411416
self.nvml.mem_clock = instance.params["nvml_mem_clock"]
412417

418+
if self.use_tegra:
419+
if "tegra_gr_clock" in instance.params:
420+
self.tegra.gr_clock = instance.params["tegra_gr_clock"]
421+
422+
413423
def benchmark(self, func, gpu_args, instance, verbose, objective, skip_nvml_setting=False):
414424
"""Benchmark the kernel instance."""
415425
logging.debug("benchmark " + instance.name)

kernel_tuner/observers/nvml.py

Lines changed: 6 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -323,7 +323,7 @@ def __init__(
323323
save_all=False,
324324
nvidia_smi_fallback=None,
325325
use_locked_clocks=False,
326-
continous_duration=1,
326+
continuous_duration=1,
327327
):
328328
"""Create an NVMLObserver."""
329329
if nvidia_smi_fallback:
@@ -355,7 +355,7 @@ def __init__(
355355
if any([obs in self.needs_power for obs in observables]):
356356
self.measure_power = True
357357
power_observables = [obs for obs in observables if obs in self.needs_power]
358-
self.continuous_observer = NVMLPowerObserver(power_observables, self, self.nvml, continous_duration)
358+
self.continuous_observer = ContinuousObserver("nvml", power_observables, self, continuous_duration=continuous_duration)
359359

360360
# remove power observables
361361
self.observables = [obs for obs in observables if obs not in self.needs_power]
@@ -373,6 +373,10 @@ def __init__(
373373
self.during_obs = [obs for obs in observables if obs in ["core_freq", "mem_freq", "temperature"]]
374374
self.iteration = {obs: [] for obs in self.during_obs}
375375

376+
def read_power(self):
377+
""" Return power in Watt """
378+
return self.nvml.pwr_usage() / 1e3
379+
376380
def before_start(self):
377381
# clear results of the observables for next measurement
378382
self.iteration = {obs: [] for obs in self.during_obs}
@@ -428,75 +432,6 @@ def get_results(self):
428432
return averaged_results
429433

430434

431-
class NVMLPowerObserver(ContinuousObserver):
432-
"""Observer that measures power using NVML and continuous benchmarking."""
433-
434-
def __init__(self, observables, parent, nvml_instance, continous_duration=1):
435-
self.parent = parent
436-
self.nvml = nvml_instance
437-
438-
supported = ["power_readings", "nvml_power", "nvml_energy"]
439-
for obs in observables:
440-
if obs not in supported:
441-
raise ValueError(f"Observable {obs} not in supported: {supported}")
442-
self.observables = observables
443-
444-
# duration in seconds
445-
self.continuous_duration = continous_duration
446-
447-
self.power = 0
448-
self.energy = 0
449-
self.power_readings = []
450-
self.t0 = 0
451-
452-
# results from the last iteration-based benchmark
453-
self.results = None
454-
455-
def before_start(self):
456-
self.parent.before_start()
457-
self.power = 0
458-
self.energy = 0
459-
self.power_readings = []
460-
461-
def after_start(self):
462-
self.parent.after_start()
463-
self.t0 = time.perf_counter()
464-
465-
def during(self):
466-
self.parent.during()
467-
power_usage = self.nvml.pwr_usage()
468-
timestamp = time.perf_counter() - self.t0
469-
# only store the result if we get a new measurement from NVML
470-
if len(self.power_readings) == 0 or (
471-
self.power_readings[-1][1] != power_usage or timestamp - self.power_readings[-1][0] > 0.01
472-
):
473-
self.power_readings.append([timestamp, power_usage])
474-
475-
def after_finish(self):
476-
self.parent.after_finish()
477-
# safeguard in case we have no measurements, perhaps the kernel was too short to measure anything
478-
if not self.power_readings:
479-
return
480-
481-
# convert to seconds from milliseconds
482-
execution_time = self.results["time"] / 1e3
483-
self.power = np.median([d[1] / 1e3 for d in self.power_readings])
484-
self.energy = self.power * execution_time
485-
486-
def get_results(self):
487-
results = self.parent.get_results()
488-
keys = list(results.keys())
489-
for key in keys:
490-
results["pwr_" + key] = results.pop(key)
491-
if "nvml_energy" in self.observables:
492-
results["nvml_energy"] = self.energy
493-
if "nvml_power" in self.observables:
494-
results["nvml_power"] = self.power
495-
if "power_readings" in self.observables:
496-
results["power_readings"] = self.power_readings
497-
return results
498-
499-
500435
# High-level Helper functions
501436

502437

kernel_tuner/observers/observer.py

Lines changed: 73 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from abc import ABC, abstractmethod
2-
2+
import time
3+
import numpy as np
34

45
class BenchmarkObserver(ABC):
56
"""Base class for Benchmark Observers"""
@@ -44,8 +45,78 @@ class IterationObserver(BenchmarkObserver):
4445

4546

4647
class ContinuousObserver(BenchmarkObserver):
47-
pass
48+
"""Generic observer that measures power while and continuous benchmarking.
49+
50+
To support continuous benchmarking an Observer should support:
51+
a .read_power() method, which the ContinuousObserver can call to read power in Watt
52+
"""
53+
def __init__(self, name, observables, parent, continuous_duration=1):
54+
self.parent = parent
55+
self.name = name
56+
57+
supported = [self.name + "_power", self.name + "_energy", "power_readings"]
58+
for obs in observables:
59+
if obs not in supported:
60+
raise ValueError(f"Observable {obs} not in supported: {supported}")
61+
self.observables = observables
62+
63+
# duration in seconds
64+
self.continuous_duration = continuous_duration
65+
66+
self.power = 0
67+
self.energy = 0
68+
self.power_readings = []
69+
self.t0 = 0
70+
71+
# results from the last iteration-based benchmark
72+
# these are set by the benchmarking function of Kernel Tuner before
73+
# the continuous observer is called.
74+
self.results = None
75+
76+
def before_start(self):
77+
self.parent.before_start()
78+
self.power = 0
79+
self.energy = 0
80+
self.power_readings = []
81+
82+
def after_start(self):
83+
self.parent.after_start()
84+
self.t0 = time.perf_counter()
4885

86+
def during(self):
87+
self.parent.during()
88+
power_usage = self.parent.read_power()
89+
timestamp = time.perf_counter() - self.t0
90+
# only store the result if we get a new measurement from the GPU
91+
if len(self.power_readings) == 0 or (
92+
self.power_readings[-1][1] != power_usage
93+
or timestamp - self.power_readings[-1][0] > 0.01
94+
):
95+
self.power_readings.append([timestamp, power_usage])
96+
97+
def after_finish(self):
98+
self.parent.after_finish()
99+
# safeguard in case we have no measurements, perhaps the kernel was too short to measure anything
100+
if not self.power_readings:
101+
return
102+
103+
# convert to seconds from milliseconds
104+
execution_time = self.results["time"] / 1e3
105+
self.power = np.median([d[1] for d in self.power_readings])
106+
self.energy = self.power * execution_time
107+
108+
def get_results(self):
109+
results = self.parent.get_results()
110+
keys = list(results.keys())
111+
for key in keys:
112+
results["pwr_" + key] = results.pop(key)
113+
if self.name + "_power" in self.observables:
114+
results[self.name + "_power"] = self.power
115+
if self.name + "_energy" in self.observables:
116+
results[self.name + "_energy"] = self.energy
117+
if "power_readings" in self.observables:
118+
results["power_readings"] = self.power_readings
119+
return results
49120

50121
class OutputObserver(BenchmarkObserver):
51122
"""Observer that can verify or measure something about the output produced by a kernel."""

kernel_tuner/observers/pmt.py

Lines changed: 57 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import numpy as np
22

3-
from kernel_tuner.observers.observer import BenchmarkObserver
3+
from kernel_tuner.observers.observer import BenchmarkObserver, ContinuousObserver
44

55
# check if pmt is installed
66
try:
@@ -28,9 +28,25 @@ class PMTObserver(BenchmarkObserver):
2828
2929
:type observables: string,list/dictionary
3030
31+
32+
:param use_continuous_observer:
33+
Boolean to control whether or not to measure power/energy using
34+
Kernel Tuner's continuous benchmarking mode. This improves measurement
35+
accuracy when using internal power sensors, such as NVML or ROCM,
36+
which have limited sampling frequency and might return averages
37+
instead of instantaneous power readings. Default value: False.
38+
39+
:type use_continuous_observer: boolean
40+
41+
42+
:param continuous_duration:
43+
Number of seconds to measure continuously for.
44+
45+
:type continuous_duration: scalar
46+
3147
"""
3248

33-
def __init__(self, observable=None):
49+
def __init__(self, observable=None, use_continuous_observer=False, continuous_duration=1):
3450
if not pmt:
3551
raise ImportError("could not import pmt")
3652

@@ -54,6 +70,9 @@ def __init__(self, observable=None):
5470
self.begin_states = [None] * len(self.pms)
5571
self.initialize_results(self.pm_names)
5672

73+
if use_continuous_observer:
74+
self.continuous_observer = PMTContinuousObserver("pmt", [], self, continuous_duration=continuous_duration)
75+
5776
def initialize_results(self, pm_names):
5877
self.results = dict()
5978
for pm_name in pm_names:
@@ -82,3 +101,39 @@ def get_results(self):
82101
averages = {key: np.average(values) for key, values in self.results.items()}
83102
self.initialize_results(self.pm_names)
84103
return averages
104+
105+
106+
class PMTContinuousObserver(ContinuousObserver):
107+
"""Generic observer that measures power while and continuous benchmarking.
108+
109+
To support continuous benchmarking an Observer should support:
110+
a .read_power() method, which the ContinuousObserver can call to read power in Watt
111+
"""
112+
def before_start(self):
113+
""" Override default method in ContinuousObserver """
114+
pass
115+
116+
def after_start(self):
117+
self.parent.after_start()
118+
119+
def during(self):
120+
""" Override default method in ContinuousObserver """
121+
pass
122+
123+
def after_finish(self):
124+
self.parent.after_finish()
125+
126+
def get_results(self):
127+
average_kernel_execution_time_ms = self.results["time"]
128+
129+
averages = {key: np.average(values) for key, values in self.results.items()}
130+
self.parent.initialize_results(self.parent.pm_names)
131+
132+
# correct energy measurement, because current _energy number is collected over the entire duration
133+
# we estimate energy as the average power over the continuous duration times the kernel execution time
134+
for pm_name in self.parent.pm_names:
135+
energy_result_name = f"{pm_name}_energy"
136+
power_result_name = f"{pm_name}_power"
137+
averages[energy_result_name] = averages[power_result_name] * (average_kernel_execution_time_ms / 1e3)
138+
139+
return averages

0 commit comments

Comments
 (0)