Skip to content

Feat/max error rate #171

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 25 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
b7638b0
wip // max error rate in scheduler
markVaykhansky May 18, 2025
6059af1
wip
markVaykhansky May 19, 2025
69a5c9e
Revert "wip"
markVaykhansky May 19, 2025
7795d2c
Handle infinite datasets with constant rate
markVaykhansky May 19, 2025
6d688f0
minor bug fixes
markVaykhansky May 19, 2025
ede651a
bugfix / last request not yielded
markVaykhansky May 21, 2025
a17117c
Add max error rate to readme, CLI & report
markVaykhansky May 21, 2025
34cb6b6
make max_error_rate optional
markVaykhansky May 21, 2025
6289c07
minor fixes
markVaykhansky May 21, 2025
d5ee018
reprot error rate bugfix
markVaykhansky May 21, 2025
ce13ef7
add current error rate log
markVaykhansky May 21, 2025
9a68a76
remove todo
markVaykhansky May 21, 2025
6dd313d
Fix tests
markVaykhansky May 21, 2025
3697b30
Pre CR fixes
markVaykhansky May 21, 2025
2fe64c7
CR Fixes
markVaykhansky May 21, 2025
b54ab14
Lint fixes
markVaykhansky May 21, 2025
b502c94
Lint fixes
markVaykhansky May 21, 2025
332ef08
better var name
May 21, 2025
c2fd813
Type fixes, typos & bugfixes
markVaykhansky May 22, 2025
4bda8cf
Remove spammy log + bugfix
markVaykhansky May 22, 2025
4b857f1
Merge remote-tracking branch 'upstream/feat/max-error-rate' into feat…
markVaykhansky May 22, 2025
0d89a39
Merge remote-tracking branch 'origin/main' into feat/max-error-rate
markVaykhansky May 22, 2025
26319a5
Sleep interminetly
markVaykhansky May 22, 2025
09925a4
Add missing error log
markVaykhansky May 22, 2025
fa56258
linting fixes
markVaykhansky May 22, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,8 @@ The `guidellm benchmark` command is used to run benchmarks against a generative

- `--max-requests`: Sets the maximum number of requests for each benchmark run. If not provided, the benchmark will run until `--max-seconds` is reached or the dataset is exhausted.

- `--max-error-rate`: The maximum error rate after which a benchmark will stop. Applicable only to finite, deterministic scenarios — i.e., `rate_type` is `constant` and `--max-seconds` is set, `--max-requests` is set, or the dataset is finite. If `--max-error-rate` is `None` or not applicable, benchmarks will continue regardless of error rate.

- `--warmup-percent`: Specifies the percentage of the benchmark to treat as a warmup phase. Requests during this phase are excluded from the final results.

- `--cooldown-percent`: Specifies the percentage of the benchmark to treat as a cooldown phase. Requests during this phase are excluded from the final results.
Expand Down
17 changes: 15 additions & 2 deletions src/guidellm/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,12 +163,23 @@ def cli():
"If None, will run until max_seconds or the data is exhausted."
),
)
@click.option(
"--max-error-rate",
type=float,
help=(
"The maximum error rate after which a benchmark will stop. "
"Applicable only for finite deterministic scenarios i.e "
"rate_type is 'constant' and 'max_seconds' exists OR "
"'max_requests' exists OR the dataset is finite. "
"If None or not applicable, benchmarks will continue regardless of error rate."
),
)
@click.option(
"--warmup-percent",
type=float,
default=None,
help=(
"The percent of the benchmark (based on max-seconds, max-requets, "
"The percent of the benchmark (based on max-seconds, max-requests, "
"or lenth of dataset) to run as a warmup and not include in the final results. "
"Defaults to None."
),
Expand All @@ -177,7 +188,7 @@ def cli():
"--cooldown-percent",
type=float,
help=(
"The percent of the benchmark (based on max-seconds, max-requets, or lenth "
"The percent of the benchmark (based on max-seconds, max-requests, or length "
"of dataset) to run as a cooldown and not include in the final results. "
"Defaults to None."
),
Expand Down Expand Up @@ -242,6 +253,7 @@ def benchmark(
rate,
max_seconds,
max_requests,
max_error_rate,
warmup_percent,
cooldown_percent,
disable_progress,
Expand All @@ -267,6 +279,7 @@ def benchmark(
rate=rate,
max_seconds=max_seconds,
max_requests=max_requests,
max_error_rate=max_error_rate,
warmup_percent=warmup_percent,
cooldown_percent=cooldown_percent,
show_progress=not disable_progress,
Expand Down
9 changes: 9 additions & 0 deletions src/guidellm/benchmark/aggregator.py
Original file line number Diff line number Diff line change
Expand Up @@ -600,6 +600,8 @@ def compile(self) -> GenerativeBenchmark:
"""
successful, incomplete, errored = self._compile_results()

error_rate = self._calculate_error_rate()

return GenerativeBenchmark.from_stats(
run_id=self.run_id,
successful=successful,
Expand All @@ -625,12 +627,19 @@ def compile(self) -> GenerativeBenchmark:
request_start_time_targeted_delay_avg=self.requests_stats.request_start_time_targeted_delay.mean,
request_time_delay_avg=self.requests_stats.request_time_delay.mean,
request_time_avg=self.requests_stats.request_time.mean,
error_rate=error_rate,
),
worker=self.worker_description,
requests_loader=self.request_loader_description,
extras=self.extras,
)

def _calculate_error_rate(self) -> float:
total_successful = self.requests_stats.totals.successful.total
total_errored = self.requests_stats.totals.errored.total
total_finished = total_errored + total_successful
return total_errored / total_finished

def _compile_results(
self,
) -> tuple[
Expand Down
11 changes: 11 additions & 0 deletions src/guidellm/benchmark/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,9 @@ class BenchmarkArgs(StandardBaseModel):
max_duration: Optional[float] = Field(
description="The maximum duration in seconds to run this benchmark, if any."
)
max_error_rate: Optional[float] = Field(
description="Maximum error rate after which a benchmark will stop."
)
warmup_number: Optional[int] = Field(
description=(
"The number of requests to run for the warmup phase of this benchmark, "
Expand Down Expand Up @@ -213,6 +216,14 @@ class BenchmarkRunStats(StandardBaseModel):
"it was completed."
)
)
# Fraction of finished requests that errored. Note: incomplete requests
# are excluded from the denominator, so this can exceed max_error_rate.
error_rate: float = Field(
    description=(
        "The number of errored requests divided by the total number "
        "of finished (errored + successful) requests. This can be "
        "higher than max_error_rate (if applicable) because it does "
        "not take into account incomplete requests."
    )
)


class BenchmarkMetrics(StandardBaseModel):
Expand Down
9 changes: 9 additions & 0 deletions src/guidellm/benchmark/benchmarker.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,11 @@ class BenchmarkerStrategyLimits(StandardBaseModel):
description="Maximum duration (in seconds) to process requests per strategy.",
ge=0,
)
max_error_rate: Optional[float] = Field(
description="Maximum error rate after which a benchmark will stop",
ge=0,
le=1,
)
warmup_percent_per_strategy: Optional[float] = Field(
description="Percentage of requests to use for warmup.",
ge=0,
Expand Down Expand Up @@ -148,6 +153,7 @@ async def run(
profile: Profile,
max_number_per_strategy: Optional[int],
max_duration_per_strategy: Optional[float],
max_error_rate: Optional[float],
warmup_percent_per_strategy: Optional[float],
cooldown_percent_per_strategy: Optional[float],
) -> AsyncGenerator[
Expand All @@ -162,6 +168,7 @@ async def run(
requests_loader_size=requests_loader_size,
max_number_per_strategy=max_number_per_strategy,
max_duration_per_strategy=max_duration_per_strategy,
max_error_rate=max_error_rate,
warmup_percent_per_strategy=warmup_percent_per_strategy,
cooldown_percent_per_strategy=cooldown_percent_per_strategy,
)
Expand Down Expand Up @@ -196,6 +203,7 @@ async def run(
scheduling_strategy=scheduling_strategy,
max_number=max_number_per_strategy,
max_duration=max_duration_per_strategy,
max_error_rate=max_error_rate,
):
if result.type_ == "run_start":
yield BenchmarkerResult(
Expand Down Expand Up @@ -321,6 +329,7 @@ def create_benchmark_aggregator(
strategy=strategy,
max_number=limits.max_number,
max_duration=limits.max_duration,
max_error_rate=limits.max_error_rate,
warmup_number=limits.warmup_number,
warmup_duration=limits.warmup_duration,
cooldown_number=limits.cooldown_number,
Expand Down
2 changes: 2 additions & 0 deletions src/guidellm/benchmark/entrypoints.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ async def benchmark_generative_text(
rate: Optional[Union[float, list[float]]],
max_seconds: Optional[float],
max_requests: Optional[int],
max_error_rate: Optional[float],
warmup_percent: Optional[float],
cooldown_percent: Optional[float],
show_progress: bool,
Expand Down Expand Up @@ -107,6 +108,7 @@ async def benchmark_generative_text(
profile=profile,
max_number_per_strategy=max_requests,
max_duration_per_strategy=max_seconds,
max_error_rate=max_error_rate,
warmup_percent_per_strategy=warmup_percent,
cooldown_percent_per_strategy=cooldown_percent,
):
Expand Down
1 change: 1 addition & 0 deletions src/guidellm/benchmark/output.py
Original file line number Diff line number Diff line change
Expand Up @@ -419,6 +419,7 @@ def benchmarks_args_str(self) -> str:
{
"max_number": args.max_number,
"max_duration": args.max_duration,
"max_error_rate": args.max_error_rate,
"warmup_number": args.warmup_number,
"warmup_duration": args.warmup_duration,
"cooldown_number": args.cooldown_number,
Expand Down
2 changes: 2 additions & 0 deletions src/guidellm/request/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from .loader import (
GenerativeRequestLoader,
GenerativeRequestLoaderDescription,
GetInfiniteDatasetLengthError,
RequestLoader,
RequestLoaderDescription,
)
Expand All @@ -10,6 +11,7 @@
"GenerationRequest",
"GenerativeRequestLoader",
"GenerativeRequestLoaderDescription",
"GetInfiniteDatasetLengthError",
"RequestLoader",
"RequestLoaderDescription",
]
11 changes: 10 additions & 1 deletion src/guidellm/request/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,16 @@
__all__ = [
"GenerativeRequestLoader",
"GenerativeRequestLoaderDescription",
"GetInfiniteDatasetLengthError",
"RequestLoader",
"RequestLoaderDescription",
]


class GetInfiniteDatasetLengthError(Exception):
    """Raised when ``len()`` is requested for a dataset whose iteration type
    is 'infinite' and therefore has no determinable length."""
    pass


class RequestLoaderDescription(StandardBaseModel):
type_: Literal["request_loader"] = "request_loader"

Expand Down Expand Up @@ -120,7 +125,11 @@ def __len__(self) -> int:
if self.iter_type == "finite":
return self.num_unique_items()

raise ValueError(f"Unable to determine length of dataset: {self.data}")
if self.iter_type != "infinite":
raise ValueError(f"Invalid iter_type {self.iter_type}")
raise GetInfiniteDatasetLengthError(f"Dataset {self.data} is "
f"infinite and thus "
f"unable to determine length")

@property
def description(self) -> GenerativeRequestLoaderDescription:
Expand Down
2 changes: 2 additions & 0 deletions src/guidellm/scheduler/result.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,12 +46,14 @@ class SchedulerRunInfo(StandardBaseModel):
end_number: float
processes: int
strategy: SchedulingStrategy
max_error_rate: Optional[float] = None

created_requests: int = 0
queued_requests: int = 0
scheduled_requests: int = 0
processing_requests: int = 0
completed_requests: int = 0
errored_requests: int = 0


class SchedulerRequestInfo(StandardBaseModel):
Expand Down
Loading
Loading