Commit d18bfdf

Add TuningBudget class and modify runners to respect the budget

1 parent: ec30052

6 files changed: +258 -151 lines
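
Note: this page shows four of the six changed files. The TuningBudget class itself (and BudgetExceededConfig, imported below) is defined in one of the files not shown, presumably kernel_tuner/util.py, since the runners import both from there. For orientation only, here is a minimal sketch of what the class could look like; every name and detail below is an assumption inferred from the calls the runners make (TuningBudget(time_limit, max_fevals), is_done(), add_evaluations(n), add_time_spent(t)), not the actual definition:

class TuningBudget:
    """Hypothetical sketch -- the real definition is not shown in this diff."""

    def __init__(self, time_limit=None, max_fevals=None):
        self.time_limit = time_limit    # allowed time, None means unlimited
        self.max_fevals = max_fevals    # allowed evaluations, None means unlimited
        self.time_spent = 0.0
        self.evaluations = 0

    def add_time_spent(self, amount):
        self.time_spent += amount

    def add_evaluations(self, count):
        self.evaluations += count

    def is_done(self):
        # The budget is exhausted once either limit is reached
        if self.time_limit is not None and self.time_spent >= self.time_limit:
            return True
        if self.max_fevals is not None and self.evaluations >= self.max_fevals:
            return True
        return False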

kernel_tuner/interface.py

Lines changed: 23 additions & 7 deletions

@@ -620,11 +620,14 @@ def tune_kernel(
 
     # copy some values from strategy_options
     searchspace_construction_options = {}
+    max_fevals = None
+    time_limit = None
+
     if strategy_options:
         if "max_fevals" in strategy_options:
-            tuning_options["max_fevals"] = strategy_options["max_fevals"]
+            max_fevals = strategy_options["max_fevals"]
         if "time_limit" in strategy_options:
-            tuning_options["time_limit"] = strategy_options["time_limit"]
+            time_limit = strategy_options["time_limit"]
         if "searchspace_construction_options" in strategy_options:
             searchspace_construction_options = strategy_options["searchspace_construction_options"]
 
@@ -703,14 +706,27 @@ def preprocess_cache(filepath):
         print(f"Searchspace has {searchspace.size} configurations after restrictions.")
 
     # register the times and raise an exception if the budget is exceeded
-    if "time_limit" in tuning_options:
-        tuning_options["startup_time"] = perf_counter() - start_overhead_time
-        if tuning_options["startup_time"] > tuning_options["time_limit"]:
+    startup_time = perf_counter() - start_overhead_time
+
+    if time_limit is not None:
+        if startup_time > time_limit:
             raise RuntimeError(
-                f"The startup time of the tuning process ({tuning_options['startup_time']} seconds) has exceeded the time limit ({tuning_options['time_limit']} seconds). "
+                f"The startup time of the tuning process ({startup_time} seconds) has exceeded the time limit ({time_limit} seconds). "
                 "Please increase the time limit or decrease the size of the search space."
             )
-    tuning_options["start_time"] = perf_counter()
+
+        time_limit -= startup_time
+
+    if max_fevals is None or max_fevals > searchspace.size:
+        logging.info(f"evaluation limit has been adjusted from {max_fevals} to {searchspace.size} (search space size)")
+        max_fevals = searchspace.size
+
+    # Create the budget. Add the time spent on startup to the budget
+    budget = util.TuningBudget(time_limit, max_fevals)
+    tuning_options["time_limit"] = time_limit  # TODO: Is this used?
+    tuning_options["max_fevals"] = max_fevals  # TODO: Is this used?
+    tuning_options["budget"] = budget
 
     # call the strategy to execute the tuning process
    results = strategy.tune(searchspace, runner, tuning_options)
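
For context, both limits enter through strategy_options, as the first hunk shows. A minimal call, adapted from the canonical vector_add example in the Kernel Tuner documentation (all kernel and data details here are illustrative, not part of this commit); time_limit is in seconds, per the error message above:

import numpy as np
from kernel_tuner import tune_kernel

kernel_string = """
__global__ void vector_add(float *c, float *a, float *b, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        c[i] = a[i] + b[i];
    }
}
"""

size = 10000000
a = np.random.randn(size).astype(np.float32)
b = np.random.randn(size).astype(np.float32)
c = np.zeros_like(a)
args = [c, a, b, np.int32(size)]

tune_params = {"block_size_x": [32, 64, 128, 256, 512, 1024]}

# The two options read at the top of this commit's first hunk: stop after
# 100 evaluated configurations or 60 seconds, whichever is hit first.
results, env = tune_kernel(
    "vector_add", kernel_string, size, args, tune_params,
    strategy_options={"max_fevals": 100, "time_limit": 60},
)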

kernel_tuner/runners/parallel.py

Lines changed: 67 additions & 38 deletions

@@ -3,10 +3,18 @@
 import logging
 import socket
 from time import perf_counter
+from typing import List, Optional
 from kernel_tuner.core import DeviceInterface
 from kernel_tuner.interface import Options
 from kernel_tuner.runners.runner import Runner
-from kernel_tuner.util import ErrorConfig, print_config_output, process_metrics, store_cache
+from kernel_tuner.util import (
+    BudgetExceededConfig,
+    ErrorConfig,
+    TuningBudget,
+    print_config_output,
+    process_metrics,
+    store_cache,
+)
 from datetime import datetime, timezone
 
 logger = logging.getLogger(__name__)

@@ -213,31 +221,31 @@ def shutdown(self):
     def available_parallelism(self):
         return len(self.workers)
 
-    def submit_jobs(self, jobs):
+    def submit_jobs(self, jobs, budget: TuningBudget):
         pending_jobs = deque(jobs)
         running_jobs = []
 
-        while pending_jobs or running_jobs:
-            should_wait = True
+        while pending_jobs and not budget.is_done():
+            job_was_submitted = False
 
             # If there is still work left, submit it now
-            if pending_jobs:
-                for i, worker in enumerate(list(self.workers)):
-                    if worker.is_available():
-                        # Push worker to back of list
-                        self.workers.pop(i)
-                        self.workers.append(worker)
+            for i, worker in enumerate(list(self.workers)):
+                if worker.is_available():
+                    # Push worker to back of list
+                    self.workers.pop(i)
+                    self.workers.append(worker)
 
-                        # Pop job and submit it
-                        job = pending_jobs.popleft()
-                        ref = worker.submit(*job)
-                        running_jobs.append(ref)
+                    # Pop job and submit it
+                    key, config = pending_jobs.popleft()
+                    ref = worker.submit(key, config)
+                    running_jobs.append(ref)
 
-                        should_wait = False
-                        break
+                    job_was_submitted = True
+                    budget.add_evaluations(1)
+                    break
 
             # If no work was submitted, wait until a worker is available
-            if should_wait:
+            if not job_was_submitted:
                 if not running_jobs:
                     raise RuntimeError("invalid state: no ray workers available")
 
@@ -246,14 +254,28 @@ def submit_jobs(self, jobs):
             for result in ready_jobs:
                 yield ray.get(result)
 
-    def run(self, parameter_space, tuning_options):
+        # If there are still pending jobs, then the budget has been exceeded.
+        # We return `None` to indicate that no result is available for these jobs.
+        while pending_jobs:
+            key, _ = pending_jobs.popleft()
+            yield (key, None)
+
+        # Wait until running jobs complete
+        while running_jobs:
+            ready_jobs, running_jobs = ray.wait(running_jobs, num_returns=1)
+
+            for result in ready_jobs:
+                yield ray.get(result)
+
+    def run(self, parameter_space, tuning_options) -> List[Optional[dict]]:
         metrics = tuning_options.metrics
         objective = tuning_options.objective
 
         jobs = []  # Jobs that need to be executed
         results = []  # Results that will be returned at the end
         key2index = dict()  # Used to insert job result back into `results`
-        duplicate_entries = []  # Used for duplicate entries in `parameter_space`
+
+        total_worker_time = 0
 
         # Select jobs which are not in the cache
         for index, config in enumerate(parameter_space):

@@ -262,28 +284,33 @@ def run(self, parameter_space, tuning_options):
 
             if key in tuning_options.cache:
                 params.update(tuning_options.cache[key])
-                params["compile_time"] = 0
-                params["verification_time"] = 0
-                params["benchmark_time"] = 0
+
+                # Simulate compile, verification, and benchmark time
+                tuning_options.budget.add_time_spent(params["compile_time"])
+                tuning_options.budget.add_time_spent(params["verification_time"])
+                tuning_options.budget.add_time_spent(params["benchmark_time"])
                 results.append(params)
             else:
-                if key not in key2index:
-                    key2index[key] = index
-                else:
-                    duplicate_entries.append((key2index[key], index))
+                assert key not in key2index, "duplicate jobs submitted"
+                key2index[key] = index
 
                 jobs.append((key, params))
                 results.append(None)
 
-        total_worker_time = 0
 
         # Submit jobs and wait for them to finish
-        for key, result in self.submit_jobs(jobs):
+        for key, result in self.submit_jobs(jobs, tuning_options.budget):
+            # `None` indicates that no result is available since the budget is exceeded.
+            # We can skip it, meaning that `results` contains `None`s for these entries.
+            if result is None:
+                continue
+
+            # Store the result into the output array
             results[key2index[key]] = result
 
             # Collect total time spent by worker
             total_worker_time += (
-                params["compile_time"] + params["verification_time"] + params["benchmark_time"]
+                result["compile_time"] + result["verification_time"] + result["benchmark_time"]
             )
 
             if isinstance(result.get(objective), ErrorConfig):

@@ -300,10 +327,6 @@
                 # add configuration to cache
                 store_cache(key, result, tuning_options.cachefile, tuning_options.cache)
 
-        # Copy each `i` to `j` for every `i,j` in `duplicate_entries`
-        for i, j in duplicate_entries:
-            results[j] = dict(results[i])
-
         total_time = 1000 * (perf_counter() - self.start_time)
         self.start_time = perf_counter()
 

@@ -313,14 +336,20 @@
         runner_time = total_time - strategy_time
         framework_time = max(runner_time * len(self.workers) - total_worker_time, 0)
 
+        num_valid_results = sum(bool(r) for r in results)  # Count the number of valid results
+
         # Post-process all the results
-        for params in results:
+        for result in results:
+            # Skip missing results
+            if not result:
+                continue
+
             # Amortize the time over all the results
-            params["strategy_time"] = strategy_time / len(results)
-            params["framework_time"] = framework_time / len(results)
+            result["strategy_time"] = strategy_time / num_valid_results
+            result["framework_time"] = framework_time / num_valid_results
 
             # only compute metrics on configs that have not errored
-            if metrics and not isinstance(params.get(objective), ErrorConfig):
-                params = process_metrics(params, metrics)
+            if not isinstance(result.get(objective), ErrorConfig):
+                result = process_metrics(result, metrics)
 
         return results
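
BudgetExceededConfig joins the import list above but is not referenced in the hunks shown on this page, so its definition must sit in one of the two changed files that are not displayed. Assuming it mirrors the existing ErrorConfig subclasses in kernel_tuner.util (ErrorConfig is imported right beside it), a plausible minimal sketch, purely hypothetical:

class BudgetExceededConfig(ErrorConfig):
    """Hypothetical sketch: marks a configuration that was never evaluated
    because the TuningBudget ran out. Not shown in this diff."""

    def __str__(self):
        return "BudgetExceededConfig"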

kernel_tuner/runners/sequential.py

Lines changed: 6 additions & 3 deletions

@@ -70,6 +70,7 @@ def run(self, parameter_space, tuning_options):
 
         # iterate over parameter space
         for element in parameter_space:
+            tuning_options.budget.add_evaluations(1)
             params = dict(zip(tuning_options.tune_params.keys(), element))
 
             if stop_criterion_reached(tuning_options):
@@ -82,9 +83,11 @@ def run(self, parameter_space, tuning_options):
             x_int = ",".join([str(i) for i in element])
             if tuning_options.cache and x_int in tuning_options.cache:
                 params.update(tuning_options.cache[x_int])
-                params["compile_time"] = 0
-                params["verification_time"] = 0
-                params["benchmark_time"] = 0
+
+                # Simulate compile, verification, and benchmark time
+                tuning_options.budget.add_time_spent(params["compile_time"])
+                tuning_options.budget.add_time_spent(params["verification_time"])
+                tuning_options.budget.add_time_spent(params["benchmark_time"])
             else:
                 # attempt to warmup the GPU by running the first config in the parameter space and ignoring the result
                 if not self.warmed_up:

kernel_tuner/runners/simulation.py

Lines changed: 26 additions & 32 deletions

@@ -54,6 +54,7 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob
         self.kernel_options = kernel_options
 
         self.start_time = perf_counter()
+        self.total_simulated_time = 0
         self.last_strategy_start_time = self.start_time
         self.last_strategy_time = 0
         self.units = {}
@@ -64,7 +65,7 @@ def get_device_info(self):
     def get_environment(self, tuning_options):
         env = self.dev.get_environment()
         env["simulation"] = True
-        env["simulated_time"] = tuning_options.simulated_time
+        env["simulated_time"] = self.total_simulated_time
        return env
 
     def run(self, parameter_space, tuning_options):
@@ -89,55 +90,48 @@ def run(self, parameter_space, tuning_options):
         # iterate over parameter space
         for element in parameter_space:
 
-            if util.stop_criterion_reached(tuning_options):
-                return results
-
+            # Append `None` to indicate that the tuning budget has been exceeded
+            if tuning_options.budget.is_done():
+                results.append(None)
+                continue
+
             # check if element is in the cache
-            x_int = ",".join([str(i) for i in element])
-            if tuning_options.cache and x_int in tuning_options.cache:
-                result = tuning_options.cache[x_int].copy()
+            key = ",".join([str(i) for i in element])
+
+            if key in tuning_options.cache:
+                # Get from cache and create a copy
+                result = dict(tuning_options.cache[key])
 
                 # only compute metrics on configs that have not errored
                 if tuning_options.metrics and not isinstance(result.get(tuning_options.objective), util.ErrorConfig):
                     result = util.process_metrics(result, tuning_options.metrics)
 
-                # Simulate behavior of sequential runner that when a configuration is
-                # served from the cache by the sequential runner, the compile_time,
-                # verification_time, and benchmark_time are set to 0.
-                # This step is only performed in the simulation runner when a configuration
-                # is served from the cache beyond the first timel. That is, when the
-                # configuration is already counted towards the unique_results.
-                # It is the responsibility of cost_func to add configs to unique_results.
-                if x_int in tuning_options.unique_results:
-                    result["compile_time"] = 0
-                    result["verification_time"] = 0
-                    result["benchmark_time"] = 0
-
-                else:
-                    # configuration is evaluated for the first time, print to the console
-                    util.print_config_output(
-                        tuning_options.tune_params, result, self.quiet, tuning_options.metrics, self.units
-                    )
+                # configuration is evaluated for the first time, print to the console
+                util.print_config_output(
+                    tuning_options.tune_params, result, self.quiet, tuning_options.metrics, self.units
+                )
 
                 # Everything but the strategy time and framework time are simulated,
                 result["strategy_time"] = strategy_time_per_config
 
+                # Simulate the evaluation of this configuration
+                tuning_options.budget.add_evaluations(1)
+                tuning_options.budget.add_time_spent(result["compile_time"])
+                tuning_options.budget.add_time_spent(result["verification_time"])
+                tuning_options.budget.add_time_spent(result["benchmark_time"])
+
                 try:
-                    simulated_time = result["compile_time"] + result["verification_time"] + result["benchmark_time"]
-                    tuning_options.simulated_time += simulated_time
+                    self.total_simulated_time += result["compile_time"] + result["verification_time"] + result["benchmark_time"]
                 except KeyError:
-                    if "time_limit" in tuning_options:
-                        raise RuntimeError(
-                            "Cannot use simulation mode with a time limit on a cache file that does not have full compile, verification, and benchmark timings on all configurations"
-                        )
+                    raise RuntimeError(
+                        "Cannot use simulation mode with a time limit on a cache file that does not have full compile, verification, and benchmark timings on all configurations"
+                    )
 
                 total_time = 1000 * (perf_counter() - self.start_time)
                 self.start_time = perf_counter()
                 result["framework_time"] = total_time
 
                 results.append(result)
-                if x_int not in tuning_options.unique_results:
-                    tuning_options.unique_results[x_int] = result
                 continue
 
             # if the configuration is not in the cache and not within restrictions, simulate an InvalidConfig with warning
