diff --git a/tuner/tuner/libtuner.py b/tuner/tuner/libtuner.py
index 1fbc616ff..4937699e9 100644
--- a/tuner/tuner/libtuner.py
+++ b/tuner/tuner/libtuner.py
@@ -144,6 +144,9 @@ class BenchmarkResult:
     time: float
     device_id: str
 
+    def is_valid(self) -> bool:
+        return math.isfinite(self.time)
+
 
 def unit_to_microseconds(real_time: float, time_unit: str) -> float:
     unit_conversions = {
@@ -750,7 +753,9 @@ def collision_handler(index_hash_list: list[tuple[int, str]]) -> tuple[bool, lis
     return collision_detected, unique_indexes
 
 
-def benchmark_candidates(candidate_indices, devices, tuning_client, candidate_trackers):
+def benchmark_candidates(
+    candidate_indices, devices, tuning_client, candidate_trackers
+) -> list[BenchmarkResult]:
     """
     Runs the benchmarking for a given list of candidate indices.
     """
@@ -807,22 +812,24 @@ def __init__(self) -> None:
         )
 
     def add_run(self, results: list[BenchmarkResult]) -> None:
+        if not BaselineResultHandler.are_baseline_devices_unique(results):
+            logging.warning(
+                "Duplicate device IDs detected in the baseline results."
+            )
         for result in results:
             self.device_baseline_results[result.device_id].append(result)
 
-    def are_baseline_devices_unique(self, results: list[BenchmarkResult]) -> bool:
+    @staticmethod
+    def are_baseline_devices_unique(results: list[BenchmarkResult]) -> bool:
         return len(results) == len(set(result.device_id for result in results))
 
     def get_valid_time_ms(self, device_id: str) -> list[float]:
         return [
             result.time
             for result in self.device_baseline_results.get(device_id, [])
-            if math.isfinite(result.time)
+            if result.is_valid()
         ]
 
-    def num_successful_runs(self, device_id: str) -> int:
-        return len(self.get_valid_time_ms(device_id))
-
     def get_average_result_ms(self, device_id: str) -> Optional[float]:
         valid_times = self.get_valid_time_ms(device_id)
         if valid_times:
@@ -834,30 +841,24 @@ def detect_regressions(
         baseline_results: list[BenchmarkResult],
         threshold: float = 1.03,
     ) -> list[str]:
+        """
+        Return a list of device IDs where regressions were detected.
+        """
         regressions = []
         for result in baseline_results:
-            if not math.isfinite(result.time):
+            if not result.is_valid():
                 continue
             baseline_avg = self.get_average_result_ms(result.device_id)
             if baseline_avg is not None and result.time > baseline_avg * threshold:
                 regressions.append(result.device_id)
-                logging.warning(
-                    f"Performance regression detected on device {result.device_id}: "
-                    f"Stored average baseline time = {baseline_avg:.2f} ms, "
-                    f"New baseline time = {result.time:.2f} ms, "
-                    f"Slower by {((result.time - baseline_avg) / baseline_avg) * 100:.2f}%"
-                )
 
         return regressions
 
     def is_valid(self) -> bool:
         """
         Check if there are any valid finite baseline time recorded.
 
-        Return True if at least a valid (finite) baseline time recorded,
-        otherwise False.
-        This method determines whether the baseline data is available for computations
-        such as calculating speedup.
+        Return True iff at least one valid (finite) baseline time is recorded.
""" return any( self.get_valid_time_ms(device_id) @@ -865,7 +866,7 @@ def is_valid(self) -> bool: ) def is_valid_for_device(self, device_id: str) -> bool: - return bool(self.get_valid_time_ms(device_id)) + return len(self.get_valid_time_ms(device_id)) != 0 def calculate_speedup( self, candidate_results: list[BenchmarkResult] @@ -876,18 +877,18 @@ def calculate_speedup( """ if not self.is_valid(): logging.warning("No valid baseline times available.") - # Use the candidate time directly when no baselines are available + # Use the candidate time directly when no baselines are available. return { candidate.candidate_id: candidate.time for candidate in candidate_results } - # Calculate the fallback baseline as the average of all valid times across devices + # Calculate the fallback baseline as the average of all valid times across devices. valid_baseline_times = [ result.time for device_id in self.device_baseline_results for result in self.device_baseline_results[device_id] - if math.isfinite(result.time) + if result.is_valid() ] fallback_baseline = sum(valid_baseline_times) / len(valid_baseline_times) @@ -1017,10 +1018,7 @@ def benchmark( baseline_handler = BaselineResultHandler() baseline_handler.add_run(first_baseline_result) if not baseline_handler.is_valid(): - logging.warning("Baseline result is not valid after first run") - - if not baseline_handler.are_baseline_devices_unique(first_baseline_result): - logging.warning("Duplicate device IDs detected in the first baseline results.") + logging.warning("Baseline run failed.") candidate_indices = [i for i in compiled_candidates if i != 0] candidate_results = benchmark_candidates( @@ -1039,26 +1037,27 @@ def benchmark( regression_devices = baseline_handler.detect_regressions(second_baseline_result) if regression_devices: logging.warning( - f"Performance regressions detected for the following devices: {', '.join(regression_devices)}" + f"Performance regressions detected for the following devices: {', '.join(regression_devices)}." ) baseline_handler.add_run(second_baseline_result) if not baseline_handler.is_valid(): - logging.warning("Baseline result is not valid after second run") - - if not baseline_handler.are_baseline_devices_unique(second_baseline_result): - logging.warning("Duplicate device IDs detected in the second baseline results.") + logging.warning("Baseline run failed.") speedup_result = baseline_handler.calculate_speedup(candidate_results) # If the baseline is valid (`baseline_handler.is_valid()`), `speedup_result` represents the speedup values. # Otherwise, `speedup_result` contains the raw time values. 
     top_candidates = baseline_handler.get_top_candidates(speedup_result, num_candidates)
 
     if baseline_handler.is_valid():
+        candidate_time_map = {
+            result.candidate_id: result.time for result in candidate_results
+        }
         for candidate_id in top_candidates:
             speedup_value = speedup_result[candidate_id]
+            actual_time = candidate_time_map[candidate_id]
             percentage_of_baseline = speedup_value * 100
             logging.info(
-                f"Candidate {candidate_id} time: {speedup_value:.2f} ms "
+                f"Candidate {candidate_id} time: {actual_time:.2f} ms "
                 f"({percentage_of_baseline:.1f}% of baseline)"
             )
     else:
diff --git a/tuner/tuner/libtuner_test.py b/tuner/tuner/libtuner_test.py
index cec8045d1..5dde3a752 100644
--- a/tuner/tuner/libtuner_test.py
+++ b/tuner/tuner/libtuner_test.py
@@ -199,8 +199,8 @@ def test_baseline_result_handler_valid():
         libtuner.BenchmarkResult(0, math.inf, "hip://1"),
         libtuner.BenchmarkResult(0, 0.7, "hip://0"),
     ]
-    assert handler.are_baseline_devices_unique([])
-    assert not handler.are_baseline_devices_unique(baseline)
+    assert libtuner.BaselineResultHandler.are_baseline_devices_unique([])
+    assert not libtuner.BaselineResultHandler.are_baseline_devices_unique(baseline)
     handler.add_run(baseline)
     assert handler.is_valid()
     assert handler.is_valid_for_device("hip://0")
@@ -214,9 +214,9 @@ def test_baseline_result_handler_valid():
         libtuner.BenchmarkResult(0, math.inf, "hip://1"),
     ]
 
-    assert handler.num_successful_runs("hip://0") == 2
-    assert handler.num_successful_runs("hip://1") == 0
-    assert handler.num_successful_runs("hip://2") == 0
+    assert handler.get_valid_time_ms("hip://0") == [0.5, 0.7]
+    assert handler.get_valid_time_ms("hip://1") == []
+    assert handler.get_valid_time_ms("hip://2") == []
 
     additional_baseline = [
         libtuner.BenchmarkResult(0, math.inf, "hip://1"),
@@ -224,10 +224,12 @@ def test_baseline_result_handler_valid():
         libtuner.BenchmarkResult(0, 1.2, "hip://1"),
         libtuner.BenchmarkResult(0, 0.8, "hip://1"),
     ]
 
-    assert not handler.are_baseline_devices_unique(additional_baseline)
+    assert not libtuner.BaselineResultHandler.are_baseline_devices_unique(
+        additional_baseline
+    )
     handler.add_run(additional_baseline)
-    assert handler.num_successful_runs("hip://0") == 2
-    assert handler.num_successful_runs("hip://0") == 2
+    assert handler.get_valid_time_ms("hip://0") == [0.5, 0.7]
+    assert handler.get_valid_time_ms("hip://1") == [1.2, 0.8]
    assert handler.is_valid_for_device("hip://1")
    assert handler.get_average_result_ms("hip://0") == 0.6
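
For reviewers, here is a minimal usage sketch of the reworked `BaselineResultHandler` (illustrative only, not part of the diff; it assumes the `BenchmarkResult(candidate_id, time, device_id)` constructor used in `libtuner_test.py` and the default `threshold=1.03` of `detect_regressions`):

```python
import math

from tuner.tuner import libtuner  # assumed import path

handler = libtuner.BaselineResultHandler()

# Failed benchmark runs report non-finite times; BenchmarkResult.is_valid()
# now filters them out of every aggregate the handler computes.
handler.add_run(
    [
        libtuner.BenchmarkResult(0, 2.0, "hip://0"),
        libtuner.BenchmarkResult(0, math.inf, "hip://1"),  # failed run
    ]
)
assert handler.is_valid()  # at least one finite baseline time recorded
assert handler.get_valid_time_ms("hip://0") == [2.0]
assert not handler.is_valid_for_device("hip://1")

# detect_regressions() flags devices whose new baseline time exceeds the
# stored average by more than the threshold: 2.5 > 2.0 * 1.03.
regressed = handler.detect_regressions([libtuner.BenchmarkResult(0, 2.5, "hip://0")])
assert regressed == ["hip://0"]
```

Note that the duplicate-device warning now fires inside `add_run()` itself, so `benchmark()` no longer has to repeat the `are_baseline_devices_unique` check after each baseline run.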