diff --git a/README.md b/README.md index a46fd411..0988c70e 100644 --- a/README.md +++ b/README.md @@ -147,6 +147,8 @@ The `guidellm benchmark` command is used to run benchmarks against a generative - `--max-requests`: Sets the maximum number of requests for each benchmark run. If not provided, the benchmark will run until `--max-seconds` is reached or the dataset is exhausted. +- `--max-error-rate`: The maximum error rate (a fraction between 0 and 1) above which a benchmark will stop. Applicable only to scenarios whose total number of requests is finite and can be determined up front, i.e., (`rate_type` is `constant` and `--max-seconds` is set) OR `--max-requests` is set OR the dataset is finite. If `--max-error-rate` is not set (`None`) or not applicable, the benchmark will continue regardless of the error rate. + - `--warmup-percent`: Specifies the percentage of the benchmark to treat as a warmup phase. Requests during this phase are excluded from the final results. - `--cooldown-percent`: Specifies the percentage of the benchmark to treat as a cooldown phase. Requests during this phase are excluded from the final results. diff --git a/src/guidellm/__main__.py b/src/guidellm/__main__.py index d81b7ddf..48ccaeed 100644 --- a/src/guidellm/__main__.py +++ b/src/guidellm/__main__.py @@ -163,12 +163,23 @@ def cli(): "If None, will run until max_seconds or the data is exhausted." ), ) +@click.option( + "--max-error-rate", + type=float, + help=( + "The maximum error rate after which a benchmark will stop. " + "Applicable only for finite deterministic scenarios i.e " + "rate_type is 'constant' and 'max_seconds' exists OR " + "'max_requests' exists OR the dataset is finite. " + "If None or not applicable, benchmarks will continue regardless of error rate." + ), +) @click.option( "--warmup-percent", type=float, default=None, help=( - "The percent of the benchmark (based on max-seconds, max-requets, " + "The percent of the benchmark (based on max-seconds, max-requests, " "or lenth of dataset) to run as a warmup and not include in the final results. " "Defaults to None." 
), @@ -177,7 +188,7 @@ def cli(): "--cooldown-percent", type=float, help=( - "The percent of the benchmark (based on max-seconds, max-requets, or lenth " + "The percent of the benchmark (based on max-seconds, max-requests, or length " "of dataset) to run as a cooldown and not include in the final results. " "Defaults to None." ), @@ -242,6 +253,7 @@ def benchmark( rate, max_seconds, max_requests, + max_error_rate, warmup_percent, cooldown_percent, disable_progress, @@ -267,6 +279,7 @@ def benchmark( rate=rate, max_seconds=max_seconds, max_requests=max_requests, + max_error_rate=max_error_rate, warmup_percent=warmup_percent, cooldown_percent=cooldown_percent, show_progress=not disable_progress, diff --git a/src/guidellm/benchmark/aggregator.py b/src/guidellm/benchmark/aggregator.py index 9943f169..cd725326 100644 --- a/src/guidellm/benchmark/aggregator.py +++ b/src/guidellm/benchmark/aggregator.py @@ -600,6 +600,8 @@ def compile(self) -> GenerativeBenchmark: """ successful, incomplete, errored = self._compile_results() + error_rate = self._calculate_error_rate() + return GenerativeBenchmark.from_stats( run_id=self.run_id, successful=successful, @@ -625,12 +627,19 @@ def compile(self) -> GenerativeBenchmark: request_start_time_targeted_delay_avg=self.requests_stats.request_start_time_targeted_delay.mean, request_time_delay_avg=self.requests_stats.request_time_delay.mean, request_time_avg=self.requests_stats.request_time.mean, + error_rate=error_rate, ), worker=self.worker_description, requests_loader=self.request_loader_description, extras=self.extras, ) + def _calculate_error_rate(self) -> float: + total_successful = self.requests_stats.totals.successful.total + total_errored = self.requests_stats.totals.errored.total + total_finished = total_errored + total_successful + return total_errored / total_finished + def _compile_results( self, ) -> tuple[ diff --git a/src/guidellm/benchmark/benchmark.py b/src/guidellm/benchmark/benchmark.py index 4e2e09a3..40ffefba 100644 
--- a/src/guidellm/benchmark/benchmark.py +++ b/src/guidellm/benchmark/benchmark.py @@ -90,6 +90,9 @@ class BenchmarkArgs(StandardBaseModel): max_duration: Optional[float] = Field( description="The maximum duration in seconds to run this benchmark, if any." ) + max_error_rate: Optional[float] = Field( + description="Maximum error rate after which a benchmark will stop." + ) warmup_number: Optional[int] = Field( description=( "The number of requests to run for the warmup phase of this benchmark, " @@ -213,6 +216,14 @@ class BenchmarkRunStats(StandardBaseModel): "it was completed." ) ) + error_rate: float = Field( + description=( + "The number of errored requests divided by the number " + "of errored requests. This can be higher than max_error_rate " + "(if applicable) cause it does not take into " + "account incomplete requests." + ) + ) class BenchmarkMetrics(StandardBaseModel): diff --git a/src/guidellm/benchmark/benchmarker.py b/src/guidellm/benchmark/benchmarker.py index 11b6d245..ecb721f7 100644 --- a/src/guidellm/benchmark/benchmarker.py +++ b/src/guidellm/benchmark/benchmarker.py @@ -74,6 +74,11 @@ class BenchmarkerStrategyLimits(StandardBaseModel): description="Maximum duration (in seconds) to process requests per strategy.", ge=0, ) + max_error_rate: Optional[float] = Field( + description="Maximum error rate after which a benchmark will stop", + ge=0, + le=1, + ) warmup_percent_per_strategy: Optional[float] = Field( description="Percentage of requests to use for warmup.", ge=0, @@ -148,6 +153,7 @@ async def run( profile: Profile, max_number_per_strategy: Optional[int], max_duration_per_strategy: Optional[float], + max_error_rate: Optional[float], warmup_percent_per_strategy: Optional[float], cooldown_percent_per_strategy: Optional[float], ) -> AsyncGenerator[ @@ -162,6 +168,7 @@ async def run( requests_loader_size=requests_loader_size, max_number_per_strategy=max_number_per_strategy, max_duration_per_strategy=max_duration_per_strategy, + 
max_error_rate=max_error_rate, warmup_percent_per_strategy=warmup_percent_per_strategy, cooldown_percent_per_strategy=cooldown_percent_per_strategy, ) @@ -196,6 +203,7 @@ async def run( scheduling_strategy=scheduling_strategy, max_number=max_number_per_strategy, max_duration=max_duration_per_strategy, + max_error_rate=max_error_rate, ): if result.type_ == "run_start": yield BenchmarkerResult( @@ -321,6 +329,7 @@ def create_benchmark_aggregator( strategy=strategy, max_number=limits.max_number, max_duration=limits.max_duration, + max_error_rate=limits.max_error_rate, warmup_number=limits.warmup_number, warmup_duration=limits.warmup_duration, cooldown_number=limits.cooldown_number, diff --git a/src/guidellm/benchmark/entrypoints.py b/src/guidellm/benchmark/entrypoints.py index 2f6c7182..a5e4da3b 100644 --- a/src/guidellm/benchmark/entrypoints.py +++ b/src/guidellm/benchmark/entrypoints.py @@ -41,6 +41,7 @@ async def benchmark_generative_text( rate: Optional[Union[float, list[float]]], max_seconds: Optional[float], max_requests: Optional[int], + max_error_rate: Optional[float], warmup_percent: Optional[float], cooldown_percent: Optional[float], show_progress: bool, @@ -107,6 +108,7 @@ async def benchmark_generative_text( profile=profile, max_number_per_strategy=max_requests, max_duration_per_strategy=max_seconds, + max_error_rate=max_error_rate, warmup_percent_per_strategy=warmup_percent, cooldown_percent_per_strategy=cooldown_percent, ): diff --git a/src/guidellm/benchmark/output.py b/src/guidellm/benchmark/output.py index 4847160d..33b1efc2 100644 --- a/src/guidellm/benchmark/output.py +++ b/src/guidellm/benchmark/output.py @@ -419,6 +419,7 @@ def benchmarks_args_str(self) -> str: { "max_number": args.max_number, "max_duration": args.max_duration, + "max_error_rate": args.max_error_rate, "warmup_number": args.warmup_number, "warmup_duration": args.warmup_duration, "cooldown_number": args.cooldown_number, diff --git a/src/guidellm/request/__init__.py 
b/src/guidellm/request/__init__.py index db3059cc..606fb897 100644 --- a/src/guidellm/request/__init__.py +++ b/src/guidellm/request/__init__.py @@ -1,6 +1,7 @@ from .loader import ( GenerativeRequestLoader, GenerativeRequestLoaderDescription, + GetInfiniteDatasetLengthError, RequestLoader, RequestLoaderDescription, ) @@ -10,6 +11,7 @@ "GenerationRequest", "GenerativeRequestLoader", "GenerativeRequestLoaderDescription", + "GetInfiniteDatasetLengthError", "RequestLoader", "RequestLoaderDescription", ] diff --git a/src/guidellm/request/loader.py b/src/guidellm/request/loader.py index 50ab3cca..62bd17ea 100644 --- a/src/guidellm/request/loader.py +++ b/src/guidellm/request/loader.py @@ -19,11 +19,16 @@ __all__ = [ "GenerativeRequestLoader", "GenerativeRequestLoaderDescription", + "GetInfiniteDatasetLengthError", "RequestLoader", "RequestLoaderDescription", ] +class GetInfiniteDatasetLengthError(Exception): + pass + + class RequestLoaderDescription(StandardBaseModel): type_: Literal["request_loader"] = "request_loader" @@ -120,7 +125,11 @@ def __len__(self) -> int: if self.iter_type == "finite": return self.num_unique_items() - raise ValueError(f"Unable to determine length of dataset: {self.data}") + if self.iter_type != "infinite": + raise ValueError(f"Invalid iter_type {self.iter_type}") + raise GetInfiniteDatasetLengthError(f"Dataset {self.data} is " + f"infinite and thus " + f"unable to determine length") @property def description(self) -> GenerativeRequestLoaderDescription: diff --git a/src/guidellm/scheduler/result.py b/src/guidellm/scheduler/result.py index 0f12687f..f899f54a 100644 --- a/src/guidellm/scheduler/result.py +++ b/src/guidellm/scheduler/result.py @@ -46,12 +46,14 @@ class SchedulerRunInfo(StandardBaseModel): end_number: float processes: int strategy: SchedulingStrategy + max_error_rate: Optional[float] = None created_requests: int = 0 queued_requests: int = 0 scheduled_requests: int = 0 processing_requests: int = 0 completed_requests: int = 0 + 
errored_requests: int = 0 class SchedulerRequestInfo(StandardBaseModel): diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py index 06203827..4097cfed 100644 --- a/src/guidellm/scheduler/scheduler.py +++ b/src/guidellm/scheduler/scheduler.py @@ -5,6 +5,7 @@ import time from collections.abc import AsyncGenerator, Iterable, Iterator from concurrent.futures import ProcessPoolExecutor +from multiprocessing.synchronize import Event as MultiprocessingEvent from typing import ( Any, Generic, @@ -15,6 +16,7 @@ from loguru import logger from guidellm.config import settings +from guidellm.request.loader import GetInfiniteDatasetLengthError from guidellm.scheduler.result import ( SchedulerRequestResult, SchedulerResult, @@ -64,12 +66,14 @@ def __init__( self.worker = worker self.request_loader = request_loader + self.error_rate: Optional[float] = None async def run( self, scheduling_strategy: SchedulingStrategy, max_number: Optional[int] = None, max_duration: Optional[float] = None, + max_error_rate: Optional[float] = None, ) -> AsyncGenerator[ Union[SchedulerResult, SchedulerRequestResult[RequestT, ResponseT]], None ]: @@ -98,20 +102,18 @@ async def run( :param max_duration: The maximum duration for the scheduling run. If None, then no limit is set and either the iterator must be exhaustible or the max_number must be set. + :param max_error_rate: The maximum error rate after which the + scheduler shuts down. + Only applicable in benchmarks with finite deterministic number of requests. + If None or not applicable then scheduler will continue regardless of errors. :return: An asynchronous generator that yields SchedulerResult objects. Each SchedulerResult object contains information about the request, the response, and the run information. 
""" - if scheduling_strategy is None or not isinstance( - scheduling_strategy, SchedulingStrategy - ): - raise ValueError(f"Invalid scheduling strategy: {scheduling_strategy}") - - if max_number is not None and max_number < 1: - raise ValueError(f"Invalid max_number: {max_number}") - - if max_duration is not None and max_duration < 0: - raise ValueError(f"Invalid max_duration: {max_duration}") + self._validate_scheduler_params(scheduling_strategy, + max_duration, + max_error_rate, + max_number) with ( multiprocessing.Manager() as manager, @@ -120,11 +122,15 @@ async def run( ) as executor, ): requests_iter: Optional[Iterator[Any]] = None - futures, requests_queue, responses_queue = await self._start_processes( - manager, executor, scheduling_strategy - ) + futures, requests_queue, responses_queue, shutdown_event = \ + await self._start_processes( + manager, executor, scheduling_strategy, max_error_rate is not None) + if shutdown_event and shutdown_event.is_set(): + raise RuntimeError( + "shutdown_event is set before starting scheduling" + ) run_info, requests_iter, times_iter = self._run_setup( - futures, scheduling_strategy, max_number, max_duration + futures, scheduling_strategy, max_number, max_duration, max_error_rate ) yield SchedulerResult( type_="run_start", @@ -132,7 +138,8 @@ async def run( ) try: - while True: + max_error_rate_reached = False + while not max_error_rate_reached: # check errors and raise them for future in futures: if future.done() and (err := future.exception()) is not None: @@ -159,6 +166,17 @@ async def run( run_info, ) if iter_result is not None: + if iter_result.request_info.errored \ + and not iter_result.request_info.canceled \ + and self._is_max_error_rate_reached(iter_result.run_info): + if shutdown_event is None: + raise RuntimeError("We've reached max_error_rate " + "but shutdown_event is corrupt") + shutdown_event.set() + max_error_rate_reached = True + logger.info(f"Max error rate of " + 
f"({iter_result.run_info.max_error_rate}) " + f"reached, sending shutdown signal") yield iter_result # yield control to the event loop @@ -173,17 +191,46 @@ async def run( await self._stop_processes(futures, requests_queue) + def _validate_scheduler_params( + self, + scheduling_strategy: SchedulingStrategy, + max_duration: Optional[float], + max_error_rate: Optional[float], + max_number: Optional[int] + ) -> None: + if scheduling_strategy is None or not isinstance( + scheduling_strategy, SchedulingStrategy + ): + raise ValueError(f"Invalid scheduling strategy: {scheduling_strategy}") + if max_number is not None and max_number < 1: + raise ValueError(f"Invalid max_number: {max_number}") + if max_duration is not None and max_duration < 0: + raise ValueError(f"Invalid max_duration: {max_duration}") + if max_error_rate is not None and (max_error_rate < 0 or max_error_rate > 1): + raise ValueError(f"Invalid max_error_rate: {max_error_rate}") + + def _is_max_error_rate_reached(self, run_info: SchedulerRunInfo) -> bool: + if run_info.max_error_rate is None: + return False + current_error_rate = run_info.errored_requests / run_info.end_number + logger.info(f"Current error rate {current_error_rate} " + f"i.e total_finished [success / error] / max total possible") + return run_info.max_error_rate < current_error_rate + async def _start_processes( self, manager, executor: ProcessPoolExecutor, scheduling_strategy: SchedulingStrategy, + create_shutdown_event: bool = False ) -> tuple[ list[asyncio.Future], multiprocessing.Queue, multiprocessing.Queue, + Optional[MultiprocessingEvent] ]: await self.worker.prepare_multiprocessing() + shutdown_event = manager.Event() if create_shutdown_event else None requests_queue = manager.Queue( maxsize=scheduling_strategy.queued_requests_limit ) @@ -220,6 +267,7 @@ async def _start_processes( requests_queue, responses_queue, id_, + shutdown_event, ) ) elif scheduling_strategy.processing_mode == "async": @@ -231,6 +279,7 @@ async def 
_start_processes( responses_queue, requests_limit, id_, + shutdown_event, ) ) else: @@ -241,7 +290,7 @@ async def _start_processes( await asyncio.sleep(0.1) # give time for processes to start - return futures, requests_queue, responses_queue + return futures, requests_queue, responses_queue, shutdown_event def _run_setup( self, @@ -249,20 +298,19 @@ def _run_setup( scheduling_strategy: SchedulingStrategy, max_number: Optional[int], max_duration: Optional[float], + max_error_rate: Optional[float], ) -> tuple[SchedulerRunInfo, Iterator[Any], Iterator[float]]: requests_iter = iter(self.request_loader) start_time = time.time() times_iter = iter(scheduling_strategy.request_times()) end_time = time.time() + (max_duration or math.inf) - end_number = max_number or math.inf + end_number = self._determine_total_requests_count( + scheduling_strategy, max_duration, max_number + ) - try: - # update end number if the request loader is finite and less than max - iter_length = len(self.request_loader) # type: ignore[arg-type] - if 0 < iter_length < end_number: - end_number = iter_length - except Exception: # noqa: BLE001, S110 - pass + if end_number == math.inf and max_error_rate is not None: + logger.warning("max_error_rate will be ignored " + "because end_number can not be determined.") if end_number == math.inf and end_time is None: logger.warning( @@ -276,10 +324,37 @@ def _run_setup( end_number=end_number, processes=len(processes), strategy=scheduling_strategy, + max_error_rate=max_error_rate ) return info, requests_iter, times_iter + def _determine_total_requests_count( + self, + scheduling_strategy: SchedulingStrategy, + max_duration: Optional[float], + max_number: Optional[int], + ) -> Union[int, float]: + end_number = max_number or math.inf + try: + # update end_number if the request_loader is finite and less than max_number + iter_length = len(self.request_loader) # type: ignore[arg-type] + if 0 < iter_length < end_number: + end_number = iter_length + except 
GetInfiniteDatasetLengthError: + # Only when RPS is constant and duration is + # capped we can determine the total amount of requests + # that are supposed to be sent + if scheduling_strategy.type_ == "constant" and max_duration is not None: + total_requests_in_max_duration = int( + scheduling_strategy.rate * max_duration + ) + if 0 < total_requests_in_max_duration < end_number: + end_number = total_requests_in_max_duration + except Exception: # noqa: BLE001, S110 + pass + return end_number + def _add_requests( self, requests_iter: Optional[Iterator[Any]], @@ -362,6 +437,9 @@ def _check_result_ready( run_info.processing_requests -= 1 run_info.completed_requests += 1 + if process_response.info.errored: + run_info.errored_requests += 1 + return SchedulerRequestResult( type_="request_complete", run_info=run_info, @@ -379,4 +457,5 @@ async def _stop_processes( for _ in futures: requests_queue.put(None) + logger.debug("Waiting for futures to shut down") await asyncio.gather(*futures) diff --git a/src/guidellm/scheduler/worker.py b/src/guidellm/scheduler/worker.py index a53b14c2..f37b7708 100644 --- a/src/guidellm/scheduler/worker.py +++ b/src/guidellm/scheduler/worker.py @@ -1,11 +1,13 @@ import asyncio import math -import multiprocessing import multiprocessing.queues +import queue import time from abc import ABC, abstractmethod from collections.abc import AsyncGenerator from dataclasses import dataclass +from datetime import timedelta +from multiprocessing.synchronize import Event as MultiprocessingEvent from typing import ( Any, Generic, @@ -121,9 +123,31 @@ async def resolve( ... 
async def get_request( - self, requests_queue: multiprocessing.Queue + self, requests_queue: multiprocessing.Queue, + shutdown_event: Optional[MultiprocessingEvent] = None, + process_id: Optional[int] = None, ) -> Optional[WorkerProcessRequest[RequestT]]: - return await asyncio.to_thread(requests_queue.get) # type: ignore[attr-defined] + if shutdown_event is not None and process_id is None: + logger.warning("shutdown_event is not None and process_id " + "is None which makes it hard to debug") + + def _get_queue_intermittently(): + if shutdown_event is None: + raise ValueError("Shouldn't use _get_queue_intermittently " + "if there's no shutdown_even") + while True: + try: + get_timeout = timedelta(seconds=1).total_seconds() + return requests_queue.get(timeout=get_timeout) + except queue.Empty: + if shutdown_event.is_set(): + logger.info(f"Shutdown signal received in future {process_id}") + return None + + get_method = _get_queue_intermittently \ + if shutdown_event is not None \ + else requests_queue.get + return await asyncio.to_thread(get_method) # type: ignore[attr-defined] async def send_result( self, @@ -141,6 +165,7 @@ async def resolve_scheduler_request( timeout_time: float, results_queue: multiprocessing.Queue, process_id: int, + shutdown_event: Optional[MultiprocessingEvent] = None, ): info = SchedulerRequestInfo( targeted_start_time=start_time, @@ -149,25 +174,41 @@ async def resolve_scheduler_request( scheduled_time=time.time(), process_id=process_id, ) - result: WorkerProcessResult[RequestT, ResponseT] = WorkerProcessResult( + request_scheduled_result: WorkerProcessResult[RequestT, ResponseT] = \ + WorkerProcessResult( type_="request_scheduled", request=request, response=None, info=info, ) - asyncio.create_task(self.send_result(results_queue, result)) + asyncio.create_task(self.send_result(results_queue, request_scheduled_result)) if (wait_time := start_time - time.time()) > 0: - await asyncio.sleep(wait_time) + if shutdown_event is None: + await 
asyncio.sleep(wait_time) + else: + shutdown_signal_received = \ + await self._sleep_intermittently_until_timestamp_or_shutdown( + sleep_until_timestamp=start_time, + shutdown_event=shutdown_event, + ) + if shutdown_signal_received: + logger.info( + "Received shutdown signal " + "while waiting to start " + f"|| Process ID {process_id}" + ) + return info.worker_start = time.time() - result = WorkerProcessResult( + request_start_result: WorkerProcessResult[RequestT, ResponseT] = \ + WorkerProcessResult( type_="request_start", request=request, response=None, info=info, ) - asyncio.create_task(self.send_result(results_queue, result)) + asyncio.create_task(self.send_result(results_queue, request_start_result)) status, response = await self.resolve(request, timeout_time) info.worker_end = time.time() @@ -185,16 +226,40 @@ async def resolve_scheduler_request( ) asyncio.create_task(self.send_result(results_queue, result)) + async def _sleep_intermittently_until_timestamp_or_shutdown( + self, + sleep_until_timestamp: float, + shutdown_event: MultiprocessingEvent, + ) -> bool: + delta = timedelta(seconds=10).total_seconds() + while time.time() < sleep_until_timestamp: + await asyncio.sleep(delta) + if shutdown_event.is_set(): + return True + return False + def process_loop_synchronous( self, requests_queue: multiprocessing.Queue, results_queue: multiprocessing.Queue, process_id: int, + shutdown_event: Optional[MultiprocessingEvent] = None, ): async def _process_runner(): while ( - process_request := await self.get_request(requests_queue) + process_request := await self.get_request( + requests_queue=requests_queue, + shutdown_event=shutdown_event, + process_id=process_id, + ) ) is not None: + if shutdown_event and shutdown_event.is_set(): + logger.error("This shouldn't happen! 
" + "We should catch the " + "shutdown in the get wrapper") + logger.info(f"Shutdown signal received in future {process_id}") + break + dequeued_time = time.time() await self.resolve_scheduler_request( @@ -205,6 +270,7 @@ async def _process_runner(): timeout_time=process_request.timeout_time, results_queue=results_queue, process_id=process_id, + shutdown_event=shutdown_event, ) try: @@ -222,6 +288,7 @@ def process_loop_asynchronous( results_queue: multiprocessing.Queue, max_concurrency: int, process_id: int, + shutdown_event: Optional[MultiprocessingEvent] = None, ): async def _process_runner(): pending = asyncio.Semaphore(max_concurrency) @@ -230,16 +297,39 @@ async def _process_runner(): raise ValueError("Async worker called with max_concurrency < 1") while ( - process_request := await self.get_request(requests_queue) + process_request := await self.get_request( + requests_queue=requests_queue, + shutdown_event=shutdown_event, + process_id=process_id) ) is not None: + if shutdown_event and shutdown_event.is_set(): + logger.error("This shouldn't happen! 
" + "We should catch the " + "shutdown in the get wrapper") + logger.info(f"Shutdown signal received" + f" in future {process_id}") + break + dequeued_time = time.time() + logger.debug(f"Dequeued Process ID {process_id} || " + f"Timestamp {dequeued_time} || " + f"Semaphore {pending._value}/{max_concurrency}") # noqa: SLF001 await pending.acquire() + lock_acquired_at = time.time() + logger.debug(f"Lock acquired Process ID {process_id} ||" + f" Timestamp {lock_acquired_at} ||" + f" Semaphore {pending._value}/{max_concurrency}") # noqa: SLF001 + def _task_done(_: asyncio.Task): nonlocal pending pending.release() + if shutdown_event and shutdown_event.is_set(): + logger.info(f"Shutdown signal received in future {process_id}") + pending.release() + break task = asyncio.create_task( self.resolve_scheduler_request( request=process_request.request, @@ -249,6 +339,7 @@ def _task_done(_: asyncio.Task): timeout_time=process_request.timeout_time, results_queue=results_queue, process_id=process_id, + shutdown_event=shutdown_event, ) ) task.add_done_callback(_task_done) @@ -314,12 +405,14 @@ def process_loop_synchronous( requests_queue: multiprocessing.Queue, results_queue: multiprocessing.Queue, process_id: int, + shutdown_event: Optional[MultiprocessingEvent] = None ): asyncio.run(self.backend.validate()) super().process_loop_synchronous( requests_queue=requests_queue, results_queue=results_queue, process_id=process_id, + shutdown_event=shutdown_event, ) def process_loop_asynchronous( @@ -328,6 +421,7 @@ def process_loop_asynchronous( results_queue: multiprocessing.Queue, max_concurrency: int, process_id: int, + shutdown_event: Optional[MultiprocessingEvent] = None ): asyncio.run(self.backend.validate()) super().process_loop_asynchronous( @@ -335,6 +429,7 @@ def process_loop_asynchronous( results_queue=results_queue, max_concurrency=max_concurrency, process_id=process_id, + shutdown_event=shutdown_event, ) async def resolve( @@ -375,7 +470,7 @@ async def resolve( request_func, 
request_kwargs = self._create_request_func_kwargs(request) async def _runner(): - # wrap function so we can enforce timeout and + # wrap function so that we can enforce timeout and # still return the latest state from the backend async for resp in request_func(**request_kwargs): # type: ignore[operator] nonlocal response diff --git a/tests/unit/benchmark/test_output.py b/tests/unit/benchmark/test_output.py index 9076834b..e3114491 100644 --- a/tests/unit/benchmark/test_output.py +++ b/tests/unit/benchmark/test_output.py @@ -113,7 +113,7 @@ def test_console_benchmarks_args_str(): mock_benchmark = mock_generative_benchmark() console.benchmarks = [mock_benchmark] assert console.benchmarks_args_str == ( - "max_number=None, max_duration=10.0, warmup_number=None, " + "max_number=None, max_duration=10.0, max_error_rate=0.05, warmup_number=None, " "warmup_duration=None, cooldown_number=None, cooldown_duration=None" ) diff --git a/tests/unit/mock_benchmark.py b/tests/unit/mock_benchmark.py index 81364fa1..3c360c68 100644 --- a/tests/unit/mock_benchmark.py +++ b/tests/unit/mock_benchmark.py @@ -221,6 +221,7 @@ def mock_generative_benchmark() -> GenerativeBenchmark: strategy=SynchronousStrategy(), max_number=None, max_duration=10.0, + max_error_rate=0.05, warmup_number=None, warmup_duration=None, cooldown_number=None, @@ -245,6 +246,7 @@ def mock_generative_benchmark() -> GenerativeBenchmark: request_start_time_targeted_delay_avg=1.2827096836907523, request_time_delay_avg=0.0004316908972603934, request_time_avg=1.426228676523481, + error_rate=0.345346, ), worker=GenerativeRequestsWorkerDescription( backend_type="openai_http",