Skip to content

Feat/max error rate #171

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 25 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
b7638b0
wip // max error rate in scheduler
markVaykhansky May 18, 2025
6059af1
wip
markVaykhansky May 19, 2025
69a5c9e
Revert "wip"
markVaykhansky May 19, 2025
7795d2c
Handle infinite datasets with constant rate
markVaykhansky May 19, 2025
6d688f0
minor bug fixes
markVaykhansky May 19, 2025
ede651a
bugfix / last request not yielded
markVaykhansky May 21, 2025
a17117c
Add max error rate to readme, CLI & report
markVaykhansky May 21, 2025
34cb6b6
make max_error_rate optional
markVaykhansky May 21, 2025
6289c07
minor fixes
markVaykhansky May 21, 2025
d5ee018
reprot error rate bugfix
markVaykhansky May 21, 2025
ce13ef7
add current error rate log
markVaykhansky May 21, 2025
9a68a76
remove todo
markVaykhansky May 21, 2025
6dd313d
Fix tests
markVaykhansky May 21, 2025
3697b30
Pre CR fixes
markVaykhansky May 21, 2025
2fe64c7
CR Fixes
markVaykhansky May 21, 2025
b54ab14
Lint fixes
markVaykhansky May 21, 2025
b502c94
Lint fixes
markVaykhansky May 21, 2025
332ef08
better var name
May 21, 2025
c2fd813
Type fixes, typos & bugfixes
markVaykhansky May 22, 2025
4bda8cf
Remove spammy log + bugfix
markVaykhansky May 22, 2025
4b857f1
Merge remote-tracking branch 'upstream/feat/max-error-rate' into feat…
markVaykhansky May 22, 2025
0d89a39
Merge remote-tracking branch 'origin/main' into feat/max-error-rate
markVaykhansky May 22, 2025
26319a5
Sleep interminetly
markVaykhansky May 22, 2025
09925a4
Add missing error log
markVaykhansky May 22, 2025
fa56258
linting fixes
markVaykhansky May 22, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,8 @@ The `guidellm benchmark` command is used to run benchmarks against a generative

- `--max-requests`: Sets the maximum number of requests for each benchmark run. If not provided, the benchmark will run until `--max-seconds` is reached or the dataset is exhausted.

- `--max-error-rate`: The maximum error rate after which a benchmark will stop. Applicable only to finite, deterministic scenarios — i.e., `rate_type` is `constant` and `--max-seconds` is set, `--max-requests` is set, or the dataset is finite. If `--max-error-rate` is `None` or not applicable, benchmarks will continue regardless of error rate.

- `--warmup-percent`: Specifies the percentage of the benchmark to treat as a warmup phase. Requests during this phase are excluded from the final results.

- `--cooldown-percent`: Specifies the percentage of the benchmark to treat as a cooldown phase. Requests during this phase are excluded from the final results.
Expand Down
17 changes: 15 additions & 2 deletions src/guidellm/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,12 +163,23 @@ def cli():
"If None, will run until max_seconds or the data is exhausted."
),
)
@click.option(
"--max-error-rate",
type=float,
help=(
"The maximum error rate after which a benchmark will stop. "
"Applicable only for finite deterministic scenarios i.e "
"rate_type is 'constant' and 'max_seconds' exists OR "
"'max_requests' exists OR the dataset is finite. "
"If None or not applicable, benchmarks will continue regardless of error rate."
),
)
@click.option(
"--warmup-percent",
type=float,
default=None,
help=(
"The percent of the benchmark (based on max-seconds, max-requets, "
"The percent of the benchmark (based on max-seconds, max-requests, "
"or lenth of dataset) to run as a warmup and not include in the final results. "
"Defaults to None."
),
Expand All @@ -177,7 +188,7 @@ def cli():
"--cooldown-percent",
type=float,
help=(
"The percent of the benchmark (based on max-seconds, max-requets, or lenth "
"The percent of the benchmark (based on max-seconds, max-requests, or length "
"of dataset) to run as a cooldown and not include in the final results. "
"Defaults to None."
),
Expand Down Expand Up @@ -242,6 +253,7 @@ def benchmark(
rate,
max_seconds,
max_requests,
max_error_rate,
warmup_percent,
cooldown_percent,
disable_progress,
Expand All @@ -267,6 +279,7 @@ def benchmark(
rate=rate,
max_seconds=max_seconds,
max_requests=max_requests,
max_error_rate=max_error_rate,
warmup_percent=warmup_percent,
cooldown_percent=cooldown_percent,
show_progress=not disable_progress,
Expand Down
9 changes: 9 additions & 0 deletions src/guidellm/benchmark/aggregator.py
Original file line number Diff line number Diff line change
Expand Up @@ -600,6 +600,8 @@ def compile(self) -> GenerativeBenchmark:
"""
successful, incomplete, errored = self._compile_results()

error_rate = self._calculate_error_rate()

return GenerativeBenchmark.from_stats(
run_id=self.run_id,
successful=successful,
Expand All @@ -625,12 +627,19 @@ def compile(self) -> GenerativeBenchmark:
request_start_time_targeted_delay_avg=self.requests_stats.request_start_time_targeted_delay.mean,
request_time_delay_avg=self.requests_stats.request_time_delay.mean,
request_time_avg=self.requests_stats.request_time.mean,
error_rate=error_rate,
),
worker=self.worker_description,
requests_loader=self.request_loader_description,
extras=self.extras,
)

def _calculate_error_rate(self) -> float:
total_successful = self.requests_stats.totals.successful.total
total_errored = self.requests_stats.totals.errored.total
total_finished = total_errored + total_successful
return total_errored / total_finished

def _compile_results(
self,
) -> tuple[
Expand Down
11 changes: 11 additions & 0 deletions src/guidellm/benchmark/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,9 @@ class BenchmarkArgs(StandardBaseModel):
max_duration: Optional[float] = Field(
description="The maximum duration in seconds to run this benchmark, if any."
)
max_error_rate: Optional[float] = Field(
description="Maximum error rate after which a benchmark will stop."
)
warmup_number: Optional[int] = Field(
description=(
"The number of requests to run for the warmup phase of this benchmark, "
Expand Down Expand Up @@ -213,6 +216,14 @@ class BenchmarkRunStats(StandardBaseModel):
"it was completed."
)
)
# Fraction of finished requests that errored. Note: incomplete requests
# are excluded from the denominator, so this can exceed max_error_rate.
error_rate: float = Field(
    description=(
        "The number of errored requests divided by the total number "
        "of finished (errored + successful) requests. This can be "
        "higher than max_error_rate (if applicable) because it does "
        "not take into account incomplete requests."
    )
)


class BenchmarkMetrics(StandardBaseModel):
Expand Down
9 changes: 9 additions & 0 deletions src/guidellm/benchmark/benchmarker.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,11 @@ class BenchmarkerStrategyLimits(StandardBaseModel):
description="Maximum duration (in seconds) to process requests per strategy.",
ge=0,
)
max_error_rate: Optional[float] = Field(
description="Maximum error rate after which a benchmark will stop",
ge=0,
le=1,
)
warmup_percent_per_strategy: Optional[float] = Field(
description="Percentage of requests to use for warmup.",
ge=0,
Expand Down Expand Up @@ -148,6 +153,7 @@ async def run(
profile: Profile,
max_number_per_strategy: Optional[int],
max_duration_per_strategy: Optional[float],
max_error_rate: Optional[float],
warmup_percent_per_strategy: Optional[float],
cooldown_percent_per_strategy: Optional[float],
) -> AsyncGenerator[
Expand All @@ -162,6 +168,7 @@ async def run(
requests_loader_size=requests_loader_size,
max_number_per_strategy=max_number_per_strategy,
max_duration_per_strategy=max_duration_per_strategy,
max_error_rate=max_error_rate,
warmup_percent_per_strategy=warmup_percent_per_strategy,
cooldown_percent_per_strategy=cooldown_percent_per_strategy,
)
Expand Down Expand Up @@ -196,6 +203,7 @@ async def run(
scheduling_strategy=scheduling_strategy,
max_number=max_number_per_strategy,
max_duration=max_duration_per_strategy,
max_error_rate=max_error_rate,
):
if result.type_ == "run_start":
yield BenchmarkerResult(
Expand Down Expand Up @@ -321,6 +329,7 @@ def create_benchmark_aggregator(
strategy=strategy,
max_number=limits.max_number,
max_duration=limits.max_duration,
max_error_rate=limits.max_error_rate,
warmup_number=limits.warmup_number,
warmup_duration=limits.warmup_duration,
cooldown_number=limits.cooldown_number,
Expand Down
2 changes: 2 additions & 0 deletions src/guidellm/benchmark/entrypoints.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ async def benchmark_generative_text(
rate: Optional[Union[float, list[float]]],
max_seconds: Optional[float],
max_requests: Optional[int],
max_error_rate: Optional[float],
warmup_percent: Optional[float],
cooldown_percent: Optional[float],
show_progress: bool,
Expand Down Expand Up @@ -107,6 +108,7 @@ async def benchmark_generative_text(
profile=profile,
max_number_per_strategy=max_requests,
max_duration_per_strategy=max_seconds,
max_error_rate=max_error_rate,
warmup_percent_per_strategy=warmup_percent,
cooldown_percent_per_strategy=cooldown_percent,
):
Expand Down
1 change: 1 addition & 0 deletions src/guidellm/benchmark/output.py
Original file line number Diff line number Diff line change
Expand Up @@ -419,6 +419,7 @@ def benchmarks_args_str(self) -> str:
{
"max_number": args.max_number,
"max_duration": args.max_duration,
"max_error_rate": args.max_error_rate,
"warmup_number": args.warmup_number,
"warmup_duration": args.warmup_duration,
"cooldown_number": args.cooldown_number,
Expand Down
2 changes: 2 additions & 0 deletions src/guidellm/request/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from .loader import (
GenerativeRequestLoader,
GenerativeRequestLoaderDescription,
GetInfiniteDatasetLengthError,
RequestLoader,
RequestLoaderDescription,
)
Expand All @@ -10,6 +11,7 @@
"GenerationRequest",
"GenerativeRequestLoader",
"GenerativeRequestLoaderDescription",
"GetInfiniteDatasetLengthError",
"RequestLoader",
"RequestLoaderDescription",
]
11 changes: 10 additions & 1 deletion src/guidellm/request/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,16 @@
__all__ = [
"GenerativeRequestLoader",
"GenerativeRequestLoaderDescription",
"GetInfiniteDatasetLengthError",
"RequestLoader",
"RequestLoaderDescription",
]


class GetInfiniteDatasetLengthError(Exception):
    """Raised when ``len()`` is requested for a dataset whose iteration type
    is 'infinite' and therefore has no determinable length."""
    pass


class RequestLoaderDescription(StandardBaseModel):
type_: Literal["request_loader"] = "request_loader"

Expand Down Expand Up @@ -120,7 +125,11 @@ def __len__(self) -> int:
if self.iter_type == "finite":
return self.num_unique_items()

raise ValueError(f"Unable to determine length of dataset: {self.data}")
if self.iter_type != "infinite":
raise ValueError(f"Invalid iter_type {self.iter_type}")
raise GetInfiniteDatasetLengthError(f"Dataset {self.data} is "
f"infinite and thus "
f"unable to determine length")

@property
def description(self) -> GenerativeRequestLoaderDescription:
Expand Down
2 changes: 2 additions & 0 deletions src/guidellm/scheduler/result.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,12 +46,14 @@ class SchedulerRunInfo(StandardBaseModel):
end_number: float
processes: int
strategy: SchedulingStrategy
max_error_rate: Optional[float] = None

created_requests: int = 0
queued_requests: int = 0
scheduled_requests: int = 0
processing_requests: int = 0
completed_requests: int = 0
errored_requests: int = 0


class SchedulerRequestInfo(StandardBaseModel):
Expand Down
Loading
Loading