diff --git a/src/llama_stack_client/_client.py b/src/llama_stack_client/_client.py index 35d993d8..82353ebd 100644 --- a/src/llama_stack_client/_client.py +++ b/src/llama_stack_client/_client.py @@ -39,6 +39,7 @@ providers, telemetry, vector_io, + benchmarks, eval_tasks, toolgroups, vector_dbs, @@ -94,6 +95,7 @@ class LlamaStackClient(SyncAPIClient): scoring: scoring.ScoringResource scoring_functions: scoring_functions.ScoringFunctionsResource eval_tasks: eval_tasks.EvalTasksResource + benchmarks: benchmarks.BenchmarksResource with_raw_response: LlamaStackClientWithRawResponse with_streaming_response: LlamaStackClientWithStreamedResponse @@ -176,6 +178,7 @@ def __init__( self.scoring = scoring.ScoringResource(self) self.scoring_functions = scoring_functions.ScoringFunctionsResource(self) self.eval_tasks = eval_tasks.EvalTasksResource(self) + self.benchmarks = benchmarks.BenchmarksResource(self) self.with_raw_response = LlamaStackClientWithRawResponse(self) self.with_streaming_response = LlamaStackClientWithStreamedResponse(self) @@ -310,6 +313,7 @@ class AsyncLlamaStackClient(AsyncAPIClient): scoring: scoring.AsyncScoringResource scoring_functions: scoring_functions.AsyncScoringFunctionsResource eval_tasks: eval_tasks.AsyncEvalTasksResource + benchmarks: benchmarks.AsyncBenchmarksResource with_raw_response: AsyncLlamaStackClientWithRawResponse with_streaming_response: AsyncLlamaStackClientWithStreamedResponse @@ -392,6 +396,7 @@ def __init__( self.scoring = scoring.AsyncScoringResource(self) self.scoring_functions = scoring_functions.AsyncScoringFunctionsResource(self) self.eval_tasks = eval_tasks.AsyncEvalTasksResource(self) + self.benchmarks = benchmarks.AsyncBenchmarksResource(self) self.with_raw_response = AsyncLlamaStackClientWithRawResponse(self) self.with_streaming_response = AsyncLlamaStackClientWithStreamedResponse(self) @@ -529,6 +534,7 @@ def __init__(self, client: LlamaStackClient) -> None: self.scoring = scoring.ScoringResourceWithRawResponse(client.scoring) self.scoring_functions = scoring_functions.ScoringFunctionsResourceWithRawResponse(client.scoring_functions) self.eval_tasks = eval_tasks.EvalTasksResourceWithRawResponse(client.eval_tasks) + self.benchmarks = benchmarks.BenchmarksResourceWithRawResponse(client.benchmarks) class AsyncLlamaStackClientWithRawResponse: @@ -560,6 +566,7 @@ def __init__(self, client: AsyncLlamaStackClient) -> None: client.scoring_functions ) self.eval_tasks = eval_tasks.AsyncEvalTasksResourceWithRawResponse(client.eval_tasks) + self.benchmarks = benchmarks.AsyncBenchmarksResourceWithRawResponse(client.benchmarks) class LlamaStackClientWithStreamedResponse: @@ -591,6 +598,7 @@ def __init__(self, client: LlamaStackClient) -> None: client.scoring_functions ) self.eval_tasks = eval_tasks.EvalTasksResourceWithStreamingResponse(client.eval_tasks) + self.benchmarks = benchmarks.BenchmarksResourceWithStreamingResponse(client.benchmarks) class AsyncLlamaStackClientWithStreamedResponse: @@ -624,6 +632,7 @@ def __init__(self, client: AsyncLlamaStackClient) -> None: client.scoring_functions ) self.eval_tasks = eval_tasks.AsyncEvalTasksResourceWithStreamingResponse(client.eval_tasks) + self.benchmarks = benchmarks.AsyncBenchmarksResourceWithStreamingResponse(client.benchmarks) Client = LlamaStackClient diff --git a/src/llama_stack_client/_decoders/jsonl.py b/src/llama_stack_client/_decoders/jsonl.py index e9d29a1c..ac5ac74f 100644 --- a/src/llama_stack_client/_decoders/jsonl.py +++ b/src/llama_stack_client/_decoders/jsonl.py @@ -17,11 +17,15 @@ class 
JSONLDecoder(Generic[_T]): into a given type. """ - http_response: httpx.Response | None + http_response: httpx.Response """The HTTP response this decoder was constructed from""" def __init__( - self, *, raw_iterator: Iterator[bytes], line_type: type[_T], http_response: httpx.Response | None + self, + *, + raw_iterator: Iterator[bytes], + line_type: type[_T], + http_response: httpx.Response, ) -> None: super().__init__() self.http_response = http_response @@ -29,6 +33,13 @@ def __init__( self._line_type = line_type self._iterator = self.__decode__() + def close(self) -> None: + """Close the response body stream. + + This is called automatically if you consume the entire stream. + """ + self.http_response.close() + def __decode__(self) -> Iterator[_T]: buf = b"" for chunk in self._raw_iterator: @@ -63,10 +74,14 @@ class AsyncJSONLDecoder(Generic[_T]): into a given type. """ - http_response: httpx.Response | None + http_response: httpx.Response def __init__( - self, *, raw_iterator: AsyncIterator[bytes], line_type: type[_T], http_response: httpx.Response | None + self, + *, + raw_iterator: AsyncIterator[bytes], + line_type: type[_T], + http_response: httpx.Response, ) -> None: super().__init__() self.http_response = http_response @@ -74,6 +89,13 @@ def __init__( self._line_type = line_type self._iterator = self.__decode__() + async def close(self) -> None: + """Close the response body stream. + + This is called automatically if you consume the entire stream. + """ + await self.http_response.aclose() + async def __decode__(self) -> AsyncIterator[_T]: buf = b"" async for chunk in self._raw_iterator: diff --git a/src/llama_stack_client/_models.py b/src/llama_stack_client/_models.py index 12c34b7d..c4401ff8 100644 --- a/src/llama_stack_client/_models.py +++ b/src/llama_stack_client/_models.py @@ -426,10 +426,16 @@ def construct_type(*, value: object, type_: object) -> object: If the given value does not match the expected type then it is returned as-is. 
""" + + # store a reference to the original type we were given before we extract any inner + # types so that we can properly resolve forward references in `TypeAliasType` annotations + original_type = None + # we allow `object` as the input type because otherwise, passing things like # `Literal['value']` will be reported as a type error by type checkers type_ = cast("type[object]", type_) if is_type_alias_type(type_): + original_type = type_ # type: ignore[unreachable] type_ = type_.__value__ # type: ignore[unreachable] # unwrap `Annotated[T, ...]` -> `T` @@ -446,7 +452,7 @@ def construct_type(*, value: object, type_: object) -> object: if is_union(origin): try: - return validate_type(type_=cast("type[object]", type_), value=value) + return validate_type(type_=cast("type[object]", original_type or type_), value=value) except Exception: pass diff --git a/src/llama_stack_client/_response.py b/src/llama_stack_client/_response.py index d7e58fbe..ea35182f 100644 --- a/src/llama_stack_client/_response.py +++ b/src/llama_stack_client/_response.py @@ -144,7 +144,7 @@ def _parse(self, *, to: type[_T] | None = None) -> R | _T: return cast( R, cast("type[JSONLDecoder[Any]]", cast_to)( - raw_iterator=self.http_response.iter_bytes(chunk_size=4096), + raw_iterator=self.http_response.iter_bytes(chunk_size=64), line_type=extract_type_arg(cast_to, 0), http_response=self.http_response, ), @@ -154,7 +154,7 @@ def _parse(self, *, to: type[_T] | None = None) -> R | _T: return cast( R, cast("type[AsyncJSONLDecoder[Any]]", cast_to)( - raw_iterator=self.http_response.aiter_bytes(chunk_size=4096), + raw_iterator=self.http_response.aiter_bytes(chunk_size=64), line_type=extract_type_arg(cast_to, 0), http_response=self.http_response, ), diff --git a/src/llama_stack_client/_utils/_transform.py b/src/llama_stack_client/_utils/_transform.py index a6b62cad..18afd9d8 100644 --- a/src/llama_stack_client/_utils/_transform.py +++ b/src/llama_stack_client/_utils/_transform.py @@ -25,7 +25,7 @@ is_annotated_type, strip_annotated_type, ) -from .._compat import model_dump, is_typeddict +from .._compat import get_origin, model_dump, is_typeddict _T = TypeVar("_T") @@ -164,9 +164,14 @@ def _transform_recursive( inner_type = annotation stripped_type = strip_annotated_type(inner_type) + origin = get_origin(stripped_type) or stripped_type if is_typeddict(stripped_type) and is_mapping(data): return _transform_typeddict(data, stripped_type) + if origin == dict and is_mapping(data): + items_type = get_args(stripped_type)[1] + return {key: _transform_recursive(value, annotation=items_type) for key, value in data.items()} + if ( # List[T] (is_list_type(stripped_type) and is_list(data)) @@ -307,9 +312,14 @@ async def _async_transform_recursive( inner_type = annotation stripped_type = strip_annotated_type(inner_type) + origin = get_origin(stripped_type) or stripped_type if is_typeddict(stripped_type) and is_mapping(data): return await _async_transform_typeddict(data, stripped_type) + if origin == dict and is_mapping(data): + items_type = get_args(stripped_type)[1] + return {key: _transform_recursive(value, annotation=items_type) for key, value in data.items()} + if ( # List[T] (is_list_type(stripped_type) and is_list(data)) diff --git a/src/llama_stack_client/resources/__init__.py b/src/llama_stack_client/resources/__init__.py index 42188633..b5e449c9 100644 --- a/src/llama_stack_client/resources/__init__.py +++ b/src/llama_stack_client/resources/__init__.py @@ -120,6 +120,14 @@ VectorIoResourceWithStreamingResponse, 
AsyncVectorIoResourceWithStreamingResponse, ) +from .benchmarks import ( + BenchmarksResource, + AsyncBenchmarksResource, + BenchmarksResourceWithRawResponse, + AsyncBenchmarksResourceWithRawResponse, + BenchmarksResourceWithStreamingResponse, + AsyncBenchmarksResourceWithStreamingResponse, +) from .eval_tasks import ( EvalTasksResource, AsyncEvalTasksResource, @@ -324,4 +332,10 @@ "AsyncEvalTasksResourceWithRawResponse", "EvalTasksResourceWithStreamingResponse", "AsyncEvalTasksResourceWithStreamingResponse", + "BenchmarksResource", + "AsyncBenchmarksResource", + "BenchmarksResourceWithRawResponse", + "AsyncBenchmarksResourceWithRawResponse", + "BenchmarksResourceWithStreamingResponse", + "AsyncBenchmarksResourceWithStreamingResponse", ] diff --git a/src/llama_stack_client/resources/benchmarks.py b/src/llama_stack_client/resources/benchmarks.py new file mode 100644 index 00000000..fe05e518 --- /dev/null +++ b/src/llama_stack_client/resources/benchmarks.py @@ -0,0 +1,328 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from __future__ import annotations + +from typing import Dict, List, Type, Union, Iterable, Optional, cast + +import httpx + +from ..types import benchmark_register_params +from .._types import NOT_GIVEN, Body, Query, Headers, NoneType, NotGiven +from .._utils import ( + maybe_transform, + async_maybe_transform, +) +from .._compat import cached_property +from .._resource import SyncAPIResource, AsyncAPIResource +from .._response import ( + to_raw_response_wrapper, + to_streamed_response_wrapper, + async_to_raw_response_wrapper, + async_to_streamed_response_wrapper, +) +from .._wrappers import DataWrapper +from .._base_client import make_request_options +from ..types.benchmark import Benchmark +from ..types.benchmark_list_response import BenchmarkListResponse + +__all__ = ["BenchmarksResource", "AsyncBenchmarksResource"] + + +class BenchmarksResource(SyncAPIResource): + @cached_property + def with_raw_response(self) -> BenchmarksResourceWithRawResponse: + """ + This property can be used as a prefix for any HTTP method call to return + the raw response object instead of the parsed content. + + For more information, see https://www.github.com/stainless-sdks/llama-stack-python#accessing-raw-response-data-eg-headers + """ + return BenchmarksResourceWithRawResponse(self) + + @cached_property + def with_streaming_response(self) -> BenchmarksResourceWithStreamingResponse: + """ + An alternative to `.with_raw_response` that doesn't eagerly read the response body. + + For more information, see https://www.github.com/stainless-sdks/llama-stack-python#with_streaming_response + """ + return BenchmarksResourceWithStreamingResponse(self) + + def retrieve( + self, + benchmark_id: str, + *, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. 
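A short sketch of the raw-response access mentioned in the `with_raw_response` docstrings above. The client construction, server URL, and printed header usage are illustrative assumptions; only the `with_raw_response` property and `.parse()` round-trip come from this patch and the linked README section.

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")  # assumed server URL

# `.with_raw_response` wraps the same benchmarks methods but returns the raw
# HTTP response; `.parse()` then yields the normally typed result
# (a BenchmarkListResponse for `.list()`).
response = client.benchmarks.with_raw_response.list()
print(response.headers)
benchmarks = response.parse()
```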
+ extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, + ) -> Optional[Benchmark]: + """ + Args: + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + if not benchmark_id: + raise ValueError(f"Expected a non-empty value for `benchmark_id` but received {benchmark_id!r}") + return self._get( + f"/v1/eval/benchmarks/{benchmark_id}", + options=make_request_options( + extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout + ), + cast_to=Benchmark, + ) + + def list( + self, + *, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, + ) -> BenchmarkListResponse: + return self._get( + "/v1/eval/benchmarks", + options=make_request_options( + extra_headers=extra_headers, + extra_query=extra_query, + extra_body=extra_body, + timeout=timeout, + post_parser=DataWrapper[BenchmarkListResponse]._unwrapper, + ), + cast_to=cast(Type[BenchmarkListResponse], DataWrapper[BenchmarkListResponse]), + ) + + def register( + self, + *, + benchmark_id: str, + dataset_id: str, + scoring_functions: List[str], + metadata: Dict[str, Union[bool, float, str, Iterable[object], object, None]] | NotGiven = NOT_GIVEN, + provider_benchmark_id: str | NotGiven = NOT_GIVEN, + provider_id: str | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, + ) -> None: + """ + Args: + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + extra_headers = {"Accept": "*/*", **(extra_headers or {})} + return self._post( + "/v1/eval/benchmarks", + body=maybe_transform( + { + "benchmark_id": benchmark_id, + "dataset_id": dataset_id, + "scoring_functions": scoring_functions, + "metadata": metadata, + "provider_benchmark_id": provider_benchmark_id, + "provider_id": provider_id, + }, + benchmark_register_params.BenchmarkRegisterParams, + ), + options=make_request_options( + extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout + ), + cast_to=NoneType, + ) + + +class AsyncBenchmarksResource(AsyncAPIResource): + @cached_property + def with_raw_response(self) -> AsyncBenchmarksResourceWithRawResponse: + """ + This property can be used as a prefix for any HTTP method call to return + the raw response object instead of the parsed content. 
+ + For more information, see https://www.github.com/stainless-sdks/llama-stack-python#accessing-raw-response-data-eg-headers + """ + return AsyncBenchmarksResourceWithRawResponse(self) + + @cached_property + def with_streaming_response(self) -> AsyncBenchmarksResourceWithStreamingResponse: + """ + An alternative to `.with_raw_response` that doesn't eagerly read the response body. + + For more information, see https://www.github.com/stainless-sdks/llama-stack-python#with_streaming_response + """ + return AsyncBenchmarksResourceWithStreamingResponse(self) + + async def retrieve( + self, + benchmark_id: str, + *, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, + ) -> Optional[Benchmark]: + """ + Args: + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + if not benchmark_id: + raise ValueError(f"Expected a non-empty value for `benchmark_id` but received {benchmark_id!r}") + return await self._get( + f"/v1/eval/benchmarks/{benchmark_id}", + options=make_request_options( + extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout + ), + cast_to=Benchmark, + ) + + async def list( + self, + *, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, + ) -> BenchmarkListResponse: + return await self._get( + "/v1/eval/benchmarks", + options=make_request_options( + extra_headers=extra_headers, + extra_query=extra_query, + extra_body=extra_body, + timeout=timeout, + post_parser=DataWrapper[BenchmarkListResponse]._unwrapper, + ), + cast_to=cast(Type[BenchmarkListResponse], DataWrapper[BenchmarkListResponse]), + ) + + async def register( + self, + *, + benchmark_id: str, + dataset_id: str, + scoring_functions: List[str], + metadata: Dict[str, Union[bool, float, str, Iterable[object], object, None]] | NotGiven = NOT_GIVEN, + provider_benchmark_id: str | NotGiven = NOT_GIVEN, + provider_id: str | NotGiven = NOT_GIVEN, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. 
+ extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, + ) -> None: + """ + Args: + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + extra_headers = {"Accept": "*/*", **(extra_headers or {})} + return await self._post( + "/v1/eval/benchmarks", + body=await async_maybe_transform( + { + "benchmark_id": benchmark_id, + "dataset_id": dataset_id, + "scoring_functions": scoring_functions, + "metadata": metadata, + "provider_benchmark_id": provider_benchmark_id, + "provider_id": provider_id, + }, + benchmark_register_params.BenchmarkRegisterParams, + ), + options=make_request_options( + extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout + ), + cast_to=NoneType, + ) + + +class BenchmarksResourceWithRawResponse: + def __init__(self, benchmarks: BenchmarksResource) -> None: + self._benchmarks = benchmarks + + self.retrieve = to_raw_response_wrapper( + benchmarks.retrieve, + ) + self.list = to_raw_response_wrapper( + benchmarks.list, + ) + self.register = to_raw_response_wrapper( + benchmarks.register, + ) + + +class AsyncBenchmarksResourceWithRawResponse: + def __init__(self, benchmarks: AsyncBenchmarksResource) -> None: + self._benchmarks = benchmarks + + self.retrieve = async_to_raw_response_wrapper( + benchmarks.retrieve, + ) + self.list = async_to_raw_response_wrapper( + benchmarks.list, + ) + self.register = async_to_raw_response_wrapper( + benchmarks.register, + ) + + +class BenchmarksResourceWithStreamingResponse: + def __init__(self, benchmarks: BenchmarksResource) -> None: + self._benchmarks = benchmarks + + self.retrieve = to_streamed_response_wrapper( + benchmarks.retrieve, + ) + self.list = to_streamed_response_wrapper( + benchmarks.list, + ) + self.register = to_streamed_response_wrapper( + benchmarks.register, + ) + + +class AsyncBenchmarksResourceWithStreamingResponse: + def __init__(self, benchmarks: AsyncBenchmarksResource) -> None: + self._benchmarks = benchmarks + + self.retrieve = async_to_streamed_response_wrapper( + benchmarks.retrieve, + ) + self.list = async_to_streamed_response_wrapper( + benchmarks.list, + ) + self.register = async_to_streamed_response_wrapper( + benchmarks.register, + ) diff --git a/src/llama_stack_client/resources/datasets.py b/src/llama_stack_client/resources/datasets.py index 59457a45..144769f9 100644 --- a/src/llama_stack_client/resources/datasets.py +++ b/src/llama_stack_client/resources/datasets.py @@ -22,7 +22,6 @@ ) from .._wrappers import DataWrapper from .._base_client import make_request_options -from ..types.shared_params.url import URL from ..types.dataset_list_response import DatasetListResponse from ..types.shared_params.param_type import ParamType from ..types.dataset_retrieve_response import DatasetRetrieveResponse @@ -108,7 +107,7 @@ def register( *, dataset_id: str, dataset_schema: Dict[str, ParamType], - url: URL, + url: dataset_register_params.URL, metadata: Dict[str, Union[bool, float, str, Iterable[object], object, None]] | NotGiven = NOT_GIVEN, provider_dataset_id: str | NotGiven = NOT_GIVEN, provider_id: str | NotGiven = NOT_GIVEN, @@ -260,7 +259,7 @@ async def register( *, dataset_id: str, dataset_schema: Dict[str, ParamType], - url: URL, + url: dataset_register_params.URL, 
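Building on the resource defined above, a minimal end-to-end sketch of registering, listing, and retrieving a benchmark. The benchmark/dataset IDs and the scoring-function name are hypothetical, and `client` is the `LlamaStackClient` from the earlier sketch.

```python
# Register a benchmark over an existing dataset (IDs are hypothetical).
client.benchmarks.register(
    benchmark_id="my-benchmark",
    dataset_id="my-dataset",
    scoring_functions=["basic::equality"],
)

# List all registered benchmarks, then fetch one by ID.
for benchmark in client.benchmarks.list():
    print(benchmark.identifier, benchmark.scoring_functions)

retrieved = client.benchmarks.retrieve("my-benchmark")
```

The async client mirrors this surface via `AsyncBenchmarksResource`, with `await` on each call.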
metadata: Dict[str, Union[bool, float, str, Iterable[object], object, None]] | NotGiven = NOT_GIVEN, provider_dataset_id: str | NotGiven = NOT_GIVEN, provider_id: str | NotGiven = NOT_GIVEN, diff --git a/src/llama_stack_client/resources/eval/eval.py b/src/llama_stack_client/resources/eval/eval.py index 7795064a..053d2398 100644 --- a/src/llama_stack_client/resources/eval/eval.py +++ b/src/llama_stack_client/resources/eval/eval.py @@ -14,7 +14,12 @@ JobsResourceWithStreamingResponse, AsyncJobsResourceWithStreamingResponse, ) -from ...types import eval_run_eval_params, eval_evaluate_rows_params +from ...types import ( + eval_run_eval_params, + eval_evaluate_rows_params, + eval_run_eval_alpha_params, + eval_evaluate_rows_alpha_params, +) from ..._types import NOT_GIVEN, Body, Query, Headers, NotGiven from ..._utils import ( maybe_transform, @@ -31,7 +36,7 @@ from ...types.job import Job from ..._base_client import make_request_options from ...types.evaluate_response import EvaluateResponse -from ...types.eval_task_config_param import EvalTaskConfigParam +from ...types.benchmark_config_param import BenchmarkConfigParam __all__ = ["EvalResource", "AsyncEvalResource"] @@ -66,7 +71,7 @@ def evaluate_rows( *, input_rows: Iterable[Dict[str, Union[bool, float, str, Iterable[object], object, None]]], scoring_functions: List[str], - task_config: EvalTaskConfigParam, + task_config: BenchmarkConfigParam, # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. # The extra values given here take precedence over values defined on the client or passed to this method. extra_headers: Headers | None = None, @@ -102,11 +107,53 @@ def evaluate_rows( cast_to=EvaluateResponse, ) + def evaluate_rows_alpha( + self, + benchmark_id: str, + *, + input_rows: Iterable[Dict[str, Union[bool, float, str, Iterable[object], object, None]]], + scoring_functions: List[str], + task_config: BenchmarkConfigParam, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, + ) -> EvaluateResponse: + """ + Args: + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + if not benchmark_id: + raise ValueError(f"Expected a non-empty value for `benchmark_id` but received {benchmark_id!r}") + return self._post( + f"/v1/eval/benchmarks/{benchmark_id}/evaluations", + body=maybe_transform( + { + "input_rows": input_rows, + "scoring_functions": scoring_functions, + "task_config": task_config, + }, + eval_evaluate_rows_alpha_params.EvalEvaluateRowsAlphaParams, + ), + options=make_request_options( + extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout + ), + cast_to=EvaluateResponse, + ) + def run_eval( self, task_id: str, *, - task_config: EvalTaskConfigParam, + task_config: BenchmarkConfigParam, # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. # The extra values given here take precedence over values defined on the client or passed to this method. 
extra_headers: Headers | None = None, @@ -135,6 +182,39 @@ def run_eval( cast_to=Job, ) + def run_eval_alpha( + self, + benchmark_id: str, + *, + task_config: BenchmarkConfigParam, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, + ) -> Job: + """ + Args: + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + if not benchmark_id: + raise ValueError(f"Expected a non-empty value for `benchmark_id` but received {benchmark_id!r}") + return self._post( + f"/v1/eval/benchmarks/{benchmark_id}/jobs", + body=maybe_transform({"task_config": task_config}, eval_run_eval_alpha_params.EvalRunEvalAlphaParams), + options=make_request_options( + extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout + ), + cast_to=Job, + ) + class AsyncEvalResource(AsyncAPIResource): @cached_property @@ -166,7 +246,7 @@ async def evaluate_rows( *, input_rows: Iterable[Dict[str, Union[bool, float, str, Iterable[object], object, None]]], scoring_functions: List[str], - task_config: EvalTaskConfigParam, + task_config: BenchmarkConfigParam, # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. # The extra values given here take precedence over values defined on the client or passed to this method. extra_headers: Headers | None = None, @@ -202,11 +282,53 @@ async def evaluate_rows( cast_to=EvaluateResponse, ) + async def evaluate_rows_alpha( + self, + benchmark_id: str, + *, + input_rows: Iterable[Dict[str, Union[bool, float, str, Iterable[object], object, None]]], + scoring_functions: List[str], + task_config: BenchmarkConfigParam, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. 
+ extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, + ) -> EvaluateResponse: + """ + Args: + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + if not benchmark_id: + raise ValueError(f"Expected a non-empty value for `benchmark_id` but received {benchmark_id!r}") + return await self._post( + f"/v1/eval/benchmarks/{benchmark_id}/evaluations", + body=await async_maybe_transform( + { + "input_rows": input_rows, + "scoring_functions": scoring_functions, + "task_config": task_config, + }, + eval_evaluate_rows_alpha_params.EvalEvaluateRowsAlphaParams, + ), + options=make_request_options( + extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout + ), + cast_to=EvaluateResponse, + ) + async def run_eval( self, task_id: str, *, - task_config: EvalTaskConfigParam, + task_config: BenchmarkConfigParam, # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. # The extra values given here take precedence over values defined on the client or passed to this method. extra_headers: Headers | None = None, @@ -235,6 +357,41 @@ async def run_eval( cast_to=Job, ) + async def run_eval_alpha( + self, + benchmark_id: str, + *, + task_config: BenchmarkConfigParam, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. 
+ extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, + ) -> Job: + """ + Args: + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + + timeout: Override the client-level default timeout for this request, in seconds + """ + if not benchmark_id: + raise ValueError(f"Expected a non-empty value for `benchmark_id` but received {benchmark_id!r}") + return await self._post( + f"/v1/eval/benchmarks/{benchmark_id}/jobs", + body=await async_maybe_transform( + {"task_config": task_config}, eval_run_eval_alpha_params.EvalRunEvalAlphaParams + ), + options=make_request_options( + extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout + ), + cast_to=Job, + ) + class EvalResourceWithRawResponse: def __init__(self, eval: EvalResource) -> None: @@ -243,9 +400,15 @@ def __init__(self, eval: EvalResource) -> None: self.evaluate_rows = to_raw_response_wrapper( eval.evaluate_rows, ) + self.evaluate_rows_alpha = to_raw_response_wrapper( + eval.evaluate_rows_alpha, + ) self.run_eval = to_raw_response_wrapper( eval.run_eval, ) + self.run_eval_alpha = to_raw_response_wrapper( + eval.run_eval_alpha, + ) @cached_property def jobs(self) -> JobsResourceWithRawResponse: @@ -259,9 +422,15 @@ def __init__(self, eval: AsyncEvalResource) -> None: self.evaluate_rows = async_to_raw_response_wrapper( eval.evaluate_rows, ) + self.evaluate_rows_alpha = async_to_raw_response_wrapper( + eval.evaluate_rows_alpha, + ) self.run_eval = async_to_raw_response_wrapper( eval.run_eval, ) + self.run_eval_alpha = async_to_raw_response_wrapper( + eval.run_eval_alpha, + ) @cached_property def jobs(self) -> AsyncJobsResourceWithRawResponse: @@ -275,9 +444,15 @@ def __init__(self, eval: EvalResource) -> None: self.evaluate_rows = to_streamed_response_wrapper( eval.evaluate_rows, ) + self.evaluate_rows_alpha = to_streamed_response_wrapper( + eval.evaluate_rows_alpha, + ) self.run_eval = to_streamed_response_wrapper( eval.run_eval, ) + self.run_eval_alpha = to_streamed_response_wrapper( + eval.run_eval_alpha, + ) @cached_property def jobs(self) -> JobsResourceWithStreamingResponse: @@ -291,9 +466,15 @@ def __init__(self, eval: AsyncEvalResource) -> None: self.evaluate_rows = async_to_streamed_response_wrapper( eval.evaluate_rows, ) + self.evaluate_rows_alpha = async_to_streamed_response_wrapper( + eval.evaluate_rows_alpha, + ) self.run_eval = async_to_streamed_response_wrapper( eval.run_eval, ) + self.run_eval_alpha = async_to_streamed_response_wrapper( + eval.run_eval_alpha, + ) @cached_property def jobs(self) -> AsyncJobsResourceWithStreamingResponse: diff --git a/src/llama_stack_client/resources/eval/jobs.py b/src/llama_stack_client/resources/eval/jobs.py index ba3e0777..d4d13f42 100644 --- a/src/llama_stack_client/resources/eval/jobs.py +++ b/src/llama_stack_client/resources/eval/jobs.py @@ -46,7 +46,7 @@ def retrieve( self, job_id: str, *, - task_id: str, + benchmark_id: str, # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. # The extra values given here take precedence over values defined on the client or passed to this method. 
extra_headers: Headers | None = None, @@ -64,12 +64,12 @@ def retrieve( timeout: Override the client-level default timeout for this request, in seconds """ - if not task_id: - raise ValueError(f"Expected a non-empty value for `task_id` but received {task_id!r}") + if not benchmark_id: + raise ValueError(f"Expected a non-empty value for `benchmark_id` but received {benchmark_id!r}") if not job_id: raise ValueError(f"Expected a non-empty value for `job_id` but received {job_id!r}") return self._get( - f"/v1/eval/tasks/{task_id}/jobs/{job_id}/result", + f"/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), @@ -80,7 +80,7 @@ def cancel( self, job_id: str, *, - task_id: str, + benchmark_id: str, # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. # The extra values given here take precedence over values defined on the client or passed to this method. extra_headers: Headers | None = None, @@ -98,13 +98,13 @@ def cancel( timeout: Override the client-level default timeout for this request, in seconds """ - if not task_id: - raise ValueError(f"Expected a non-empty value for `task_id` but received {task_id!r}") + if not benchmark_id: + raise ValueError(f"Expected a non-empty value for `benchmark_id` but received {benchmark_id!r}") if not job_id: raise ValueError(f"Expected a non-empty value for `job_id` but received {job_id!r}") extra_headers = {"Accept": "*/*", **(extra_headers or {})} return self._delete( - f"/v1/eval/tasks/{task_id}/jobs/{job_id}", + f"/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), @@ -115,7 +115,7 @@ def status( self, job_id: str, *, - task_id: str, + benchmark_id: str, # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. # The extra values given here take precedence over values defined on the client or passed to this method. extra_headers: Headers | None = None, @@ -133,12 +133,12 @@ def status( timeout: Override the client-level default timeout for this request, in seconds """ - if not task_id: - raise ValueError(f"Expected a non-empty value for `task_id` but received {task_id!r}") + if not benchmark_id: + raise ValueError(f"Expected a non-empty value for `benchmark_id` but received {benchmark_id!r}") if not job_id: raise ValueError(f"Expected a non-empty value for `job_id` but received {job_id!r}") return self._get( - f"/v1/eval/tasks/{task_id}/jobs/{job_id}", + f"/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), @@ -170,7 +170,7 @@ async def retrieve( self, job_id: str, *, - task_id: str, + benchmark_id: str, # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. # The extra values given here take precedence over values defined on the client or passed to this method. 
extra_headers: Headers | None = None, @@ -188,12 +188,12 @@ async def retrieve( timeout: Override the client-level default timeout for this request, in seconds """ - if not task_id: - raise ValueError(f"Expected a non-empty value for `task_id` but received {task_id!r}") + if not benchmark_id: + raise ValueError(f"Expected a non-empty value for `benchmark_id` but received {benchmark_id!r}") if not job_id: raise ValueError(f"Expected a non-empty value for `job_id` but received {job_id!r}") return await self._get( - f"/v1/eval/tasks/{task_id}/jobs/{job_id}/result", + f"/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), @@ -204,7 +204,7 @@ async def cancel( self, job_id: str, *, - task_id: str, + benchmark_id: str, # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. # The extra values given here take precedence over values defined on the client or passed to this method. extra_headers: Headers | None = None, @@ -222,13 +222,13 @@ async def cancel( timeout: Override the client-level default timeout for this request, in seconds """ - if not task_id: - raise ValueError(f"Expected a non-empty value for `task_id` but received {task_id!r}") + if not benchmark_id: + raise ValueError(f"Expected a non-empty value for `benchmark_id` but received {benchmark_id!r}") if not job_id: raise ValueError(f"Expected a non-empty value for `job_id` but received {job_id!r}") extra_headers = {"Accept": "*/*", **(extra_headers or {})} return await self._delete( - f"/v1/eval/tasks/{task_id}/jobs/{job_id}", + f"/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), @@ -239,7 +239,7 @@ async def status( self, job_id: str, *, - task_id: str, + benchmark_id: str, # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. # The extra values given here take precedence over values defined on the client or passed to this method. 
extra_headers: Headers | None = None, @@ -257,12 +257,12 @@ async def status( timeout: Override the client-level default timeout for this request, in seconds """ - if not task_id: - raise ValueError(f"Expected a non-empty value for `task_id` but received {task_id!r}") + if not benchmark_id: + raise ValueError(f"Expected a non-empty value for `benchmark_id` but received {benchmark_id!r}") if not job_id: raise ValueError(f"Expected a non-empty value for `job_id` but received {job_id!r}") return await self._get( - f"/v1/eval/tasks/{task_id}/jobs/{job_id}", + f"/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), diff --git a/src/llama_stack_client/resources/eval_tasks.py b/src/llama_stack_client/resources/eval_tasks.py index 82a07839..40dbe8f2 100644 --- a/src/llama_stack_client/resources/eval_tasks.py +++ b/src/llama_stack_client/resources/eval_tasks.py @@ -22,8 +22,8 @@ ) from .._wrappers import DataWrapper from .._base_client import make_request_options -from ..types.eval_task import EvalTask -from ..types.eval_task_list_response import EvalTaskListResponse +from ..types.benchmark import Benchmark +from ..types.benchmark_list_response import BenchmarkListResponse __all__ = ["EvalTasksResource", "AsyncEvalTasksResource"] @@ -58,7 +58,7 @@ def retrieve( extra_query: Query | None = None, extra_body: Body | None = None, timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, - ) -> Optional[EvalTask]: + ) -> Optional[Benchmark]: """ Args: extra_headers: Send extra headers @@ -76,7 +76,7 @@ def retrieve( options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), - cast_to=EvalTask, + cast_to=Benchmark, ) def list( @@ -88,7 +88,7 @@ def list( extra_query: Query | None = None, extra_body: Body | None = None, timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, - ) -> EvalTaskListResponse: + ) -> BenchmarkListResponse: return self._get( "/v1/eval-tasks", options=make_request_options( @@ -96,9 +96,9 @@ def list( extra_query=extra_query, extra_body=extra_body, timeout=timeout, - post_parser=DataWrapper[EvalTaskListResponse]._unwrapper, + post_parser=DataWrapper[BenchmarkListResponse]._unwrapper, ), - cast_to=cast(Type[EvalTaskListResponse], DataWrapper[EvalTaskListResponse]), + cast_to=cast(Type[BenchmarkListResponse], DataWrapper[BenchmarkListResponse]), ) def register( @@ -108,7 +108,7 @@ def register( eval_task_id: str, scoring_functions: List[str], metadata: Dict[str, Union[bool, float, str, Iterable[object], object, None]] | NotGiven = NOT_GIVEN, - provider_eval_task_id: str | NotGiven = NOT_GIVEN, + provider_benchmark_id: str | NotGiven = NOT_GIVEN, provider_id: str | NotGiven = NOT_GIVEN, # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. # The extra values given here take precedence over values defined on the client or passed to this method. 
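A sketch tying together the new `*_alpha` eval methods and the benchmark-scoped job endpoints above. Only the top-level `BenchmarkConfigParam` keys (`type`, `eval_candidate`, `scoring_params`) come from this patch; the model-candidate payload, model name, and benchmark ID are assumptions.

```python
# Start an evaluation run against a registered benchmark. The eval_candidate
# body is an assumed model candidate, not something defined in this patch.
job = client.eval.run_eval_alpha(
    "my-benchmark",
    task_config={
        "type": "benchmark",
        "eval_candidate": {
            "type": "model",
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "sampling_params": {"strategy": {"type": "greedy"}},
        },
        "scoring_params": {},
    },
)

# Job endpoints are now keyed by benchmark_id instead of task_id.
# `job.job_id` is the identifier on the returned Job model.
print(client.eval.jobs.status(job.job_id, benchmark_id="my-benchmark"))
result = client.eval.jobs.retrieve(job.job_id, benchmark_id="my-benchmark")
```

`evaluate_rows_alpha` takes the same `task_config` plus `input_rows` and `scoring_functions` for row-level evaluation against the benchmark.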
@@ -136,7 +136,7 @@ def register( "eval_task_id": eval_task_id, "scoring_functions": scoring_functions, "metadata": metadata, - "provider_eval_task_id": provider_eval_task_id, + "provider_benchmark_id": provider_benchmark_id, "provider_id": provider_id, }, eval_task_register_params.EvalTaskRegisterParams, @@ -178,7 +178,7 @@ async def retrieve( extra_query: Query | None = None, extra_body: Body | None = None, timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, - ) -> Optional[EvalTask]: + ) -> Optional[Benchmark]: """ Args: extra_headers: Send extra headers @@ -196,7 +196,7 @@ async def retrieve( options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), - cast_to=EvalTask, + cast_to=Benchmark, ) async def list( @@ -208,7 +208,7 @@ async def list( extra_query: Query | None = None, extra_body: Body | None = None, timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, - ) -> EvalTaskListResponse: + ) -> BenchmarkListResponse: return await self._get( "/v1/eval-tasks", options=make_request_options( @@ -216,9 +216,9 @@ async def list( extra_query=extra_query, extra_body=extra_body, timeout=timeout, - post_parser=DataWrapper[EvalTaskListResponse]._unwrapper, + post_parser=DataWrapper[BenchmarkListResponse]._unwrapper, ), - cast_to=cast(Type[EvalTaskListResponse], DataWrapper[EvalTaskListResponse]), + cast_to=cast(Type[BenchmarkListResponse], DataWrapper[BenchmarkListResponse]), ) async def register( @@ -228,7 +228,7 @@ async def register( eval_task_id: str, scoring_functions: List[str], metadata: Dict[str, Union[bool, float, str, Iterable[object], object, None]] | NotGiven = NOT_GIVEN, - provider_eval_task_id: str | NotGiven = NOT_GIVEN, + provider_benchmark_id: str | NotGiven = NOT_GIVEN, provider_id: str | NotGiven = NOT_GIVEN, # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. # The extra values given here take precedence over values defined on the client or passed to this method. @@ -256,7 +256,7 @@ async def register( "eval_task_id": eval_task_id, "scoring_functions": scoring_functions, "metadata": metadata, - "provider_eval_task_id": provider_eval_task_id, + "provider_benchmark_id": provider_benchmark_id, "provider_id": provider_id, }, eval_task_register_params.EvalTaskRegisterParams, diff --git a/src/llama_stack_client/resources/tool_runtime/tool_runtime.py b/src/llama_stack_client/resources/tool_runtime/tool_runtime.py index f3b92a74..2bd7347b 100644 --- a/src/llama_stack_client/resources/tool_runtime/tool_runtime.py +++ b/src/llama_stack_client/resources/tool_runtime/tool_runtime.py @@ -31,7 +31,6 @@ from ..._base_client import make_request_options from ...types.tool_def import ToolDef from ..._decoders.jsonl import JSONLDecoder, AsyncJSONLDecoder -from ...types.shared_params.url import URL from ...types.tool_invocation_result import ToolInvocationResult __all__ = ["ToolRuntimeResource", "AsyncToolRuntimeResource"] @@ -103,7 +102,7 @@ def invoke_tool( def list_tools( self, *, - mcp_endpoint: URL | NotGiven = NOT_GIVEN, + mcp_endpoint: tool_runtime_list_tools_params.McpEndpoint | NotGiven = NOT_GIVEN, tool_group_id: str | NotGiven = NOT_GIVEN, # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. # The extra values given here take precedence over values defined on the client or passed to this method. 
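With the shared `URL` type removed, the MCP endpoint for `tool_runtime.list_tools` is passed inline. The sketch below assumes the new `McpEndpoint` param follows the same `{"uri": ...}` shape as the other inlined URL params in this patch; the toolgroup ID and endpoint URL are made up.

```python
# List tools exposed by an MCP server; mcp_endpoint is now a plain dict.
tools = client.tool_runtime.list_tools(
    tool_group_id="mcp::my-tools",
    mcp_endpoint={"uri": "http://localhost:8000/sse"},
)
for tool in tools:
    print(tool)
```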
@@ -209,7 +208,7 @@ async def invoke_tool( async def list_tools( self, *, - mcp_endpoint: URL | NotGiven = NOT_GIVEN, + mcp_endpoint: tool_runtime_list_tools_params.McpEndpoint | NotGiven = NOT_GIVEN, tool_group_id: str | NotGiven = NOT_GIVEN, # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. # The extra values given here take precedence over values defined on the client or passed to this method. diff --git a/src/llama_stack_client/resources/toolgroups.py b/src/llama_stack_client/resources/toolgroups.py index 234be628..6a9b79d0 100644 --- a/src/llama_stack_client/resources/toolgroups.py +++ b/src/llama_stack_client/resources/toolgroups.py @@ -23,7 +23,6 @@ from .._wrappers import DataWrapper from .._base_client import make_request_options from ..types.tool_group import ToolGroup -from ..types.shared_params.url import URL from ..types.toolgroup_list_response import ToolgroupListResponse __all__ = ["ToolgroupsResource", "AsyncToolgroupsResource"] @@ -109,7 +108,7 @@ def register( provider_id: str, toolgroup_id: str, args: Dict[str, Union[bool, float, str, Iterable[object], object, None]] | NotGiven = NOT_GIVEN, - mcp_endpoint: URL | NotGiven = NOT_GIVEN, + mcp_endpoint: toolgroup_register_params.McpEndpoint | NotGiven = NOT_GIVEN, # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. # The extra values given here take precedence over values defined on the client or passed to this method. extra_headers: Headers | None = None, @@ -262,7 +261,7 @@ async def register( provider_id: str, toolgroup_id: str, args: Dict[str, Union[bool, float, str, Iterable[object], object, None]] | NotGiven = NOT_GIVEN, - mcp_endpoint: URL | NotGiven = NOT_GIVEN, + mcp_endpoint: toolgroup_register_params.McpEndpoint | NotGiven = NOT_GIVEN, # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. # The extra values given here take precedence over values defined on the client or passed to this method. 
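Similarly for tool groups, a hedged sketch of registering an MCP-backed toolgroup with the inlined endpoint dict; the provider ID, toolgroup ID, and endpoint URL are illustrative, not values from this patch.

```python
# Register a toolgroup backed by an MCP server (illustrative IDs).
client.toolgroups.register(
    toolgroup_id="mcp::my-tools",
    provider_id="model-context-protocol",
    mcp_endpoint={"uri": "http://localhost:8000/sse"},
)
```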
extra_headers: Headers | None = None, diff --git a/src/llama_stack_client/types/__init__.py b/src/llama_stack_client/types/__init__.py index 45824a74..bc94eb13 100644 --- a/src/llama_stack_client/types/__init__.py +++ b/src/llama_stack_client/types/__init__.py @@ -7,7 +7,6 @@ from .model import Model as Model from .trace import Trace as Trace from .shared import ( - URL as URL, Message as Message, Document as Document, ToolCall as ToolCall, @@ -24,6 +23,7 @@ SamplingParams as SamplingParams, BatchCompletion as BatchCompletion, SafetyViolation as SafetyViolation, + ToolCallOrString as ToolCallOrString, CompletionMessage as CompletionMessage, InterleavedContent as InterleavedContent, ToolParamDefinition as ToolParamDefinition, @@ -34,7 +34,7 @@ ) from .shield import Shield as Shield from .tool_def import ToolDef as ToolDef -from .eval_task import EvalTask as EvalTask +from .benchmark import Benchmark as Benchmark from .route_info import RouteInfo as RouteInfo from .scoring_fn import ScoringFn as ScoringFn from .tool_group import ToolGroup as ToolGroup @@ -77,13 +77,14 @@ from .query_chunks_response import QueryChunksResponse as QueryChunksResponse from .query_condition_param import QueryConditionParam as QueryConditionParam from .algorithm_config_param import AlgorithmConfigParam as AlgorithmConfigParam -from .eval_task_config_param import EvalTaskConfigParam as EvalTaskConfigParam +from .benchmark_config_param import BenchmarkConfigParam as BenchmarkConfigParam from .list_datasets_response import ListDatasetsResponse as ListDatasetsResponse from .provider_list_response import ProviderListResponse as ProviderListResponse from .scoring_score_response import ScoringScoreResponse as ScoringScoreResponse from .shield_register_params import ShieldRegisterParams as ShieldRegisterParams from .tool_invocation_result import ToolInvocationResult as ToolInvocationResult from .vector_io_query_params import VectorIoQueryParams as VectorIoQueryParams +from .benchmark_list_response import BenchmarkListResponse as BenchmarkListResponse from .dataset_register_params import DatasetRegisterParams as DatasetRegisterParams from .eval_task_list_response import EvalTaskListResponse as EvalTaskListResponse from .list_providers_response import ListProvidersResponse as ListProvidersResponse @@ -91,15 +92,17 @@ from .toolgroup_list_response import ToolgroupListResponse as ToolgroupListResponse from .vector_db_list_response import VectorDBListResponse as VectorDBListResponse from .vector_io_insert_params import VectorIoInsertParams as VectorIoInsertParams -from .list_eval_tasks_response import ListEvalTasksResponse as ListEvalTasksResponse +from .list_benchmarks_response import ListBenchmarksResponse as ListBenchmarksResponse from .list_vector_dbs_response import ListVectorDBsResponse as ListVectorDBsResponse from .safety_run_shield_params import SafetyRunShieldParams as SafetyRunShieldParams +from .benchmark_register_params import BenchmarkRegisterParams as BenchmarkRegisterParams from .dataset_retrieve_response import DatasetRetrieveResponse as DatasetRetrieveResponse from .eval_evaluate_rows_params import EvalEvaluateRowsParams as EvalEvaluateRowsParams from .eval_task_register_params import EvalTaskRegisterParams as EvalTaskRegisterParams from .list_tool_groups_response import ListToolGroupsResponse as ListToolGroupsResponse from .toolgroup_register_params import ToolgroupRegisterParams as ToolgroupRegisterParams from .vector_db_register_params import VectorDBRegisterParams as VectorDBRegisterParams +from 
.eval_run_eval_alpha_params import EvalRunEvalAlphaParams as EvalRunEvalAlphaParams from .scoring_score_batch_params import ScoringScoreBatchParams as ScoringScoreBatchParams from .telemetry_log_event_params import TelemetryLogEventParams as TelemetryLogEventParams from .inference_completion_params import InferenceCompletionParams as InferenceCompletionParams @@ -115,6 +118,7 @@ from .telemetry_get_span_tree_params import TelemetryGetSpanTreeParams as TelemetryGetSpanTreeParams from .telemetry_query_spans_response import TelemetryQuerySpansResponse as TelemetryQuerySpansResponse from .tool_runtime_list_tools_params import ToolRuntimeListToolsParams as ToolRuntimeListToolsParams +from .eval_evaluate_rows_alpha_params import EvalEvaluateRowsAlphaParams as EvalEvaluateRowsAlphaParams from .list_scoring_functions_response import ListScoringFunctionsResponse as ListScoringFunctionsResponse from .telemetry_query_traces_response import TelemetryQueryTracesResponse as TelemetryQueryTracesResponse from .tool_runtime_invoke_tool_params import ToolRuntimeInvokeToolParams as ToolRuntimeInvokeToolParams diff --git a/src/llama_stack_client/types/agents/turn.py b/src/llama_stack_client/types/agents/turn.py index 2ead7bbe..408d6446 100644 --- a/src/llama_stack_client/types/agents/turn.py +++ b/src/llama_stack_client/types/agents/turn.py @@ -6,7 +6,6 @@ from ..._utils import PropertyInfo from ..._models import BaseModel -from ..shared.url import URL from ..inference_step import InferenceStep from ..shield_call_step import ShieldCallStep from ..shared.user_message import UserMessage @@ -24,7 +23,9 @@ "OutputAttachmentContent", "OutputAttachmentContentImageContentItem", "OutputAttachmentContentImageContentItemImage", + "OutputAttachmentContentImageContentItemImageURL", "OutputAttachmentContentTextContentItem", + "OutputAttachmentContentURL", ] InputMessage: TypeAlias = Union[UserMessage, ToolResponseMessage] @@ -35,11 +36,15 @@ ] +class OutputAttachmentContentImageContentItemImageURL(BaseModel): + uri: str + + class OutputAttachmentContentImageContentItemImage(BaseModel): data: Optional[str] = None """base64 encoded image data as string""" - url: Optional[URL] = None + url: Optional[OutputAttachmentContentImageContentItemImageURL] = None """A URL of the image or data URL in the format of data:image/{type};base64,{data}. Note that URL could have length limits. @@ -62,12 +67,16 @@ class OutputAttachmentContentTextContentItem(BaseModel): """Discriminator type of the content item. 
Always "text" """ +class OutputAttachmentContentURL(BaseModel): + uri: str + + OutputAttachmentContent: TypeAlias = Union[ str, OutputAttachmentContentImageContentItem, OutputAttachmentContentTextContentItem, List[InterleavedContentItem], - URL, + OutputAttachmentContentURL, ] diff --git a/src/llama_stack_client/types/agents/turn_create_params.py b/src/llama_stack_client/types/agents/turn_create_params.py index fee300dd..92040b56 100644 --- a/src/llama_stack_client/types/agents/turn_create_params.py +++ b/src/llama_stack_client/types/agents/turn_create_params.py @@ -5,7 +5,6 @@ from typing import Dict, List, Union, Iterable from typing_extensions import Literal, Required, TypeAlias, TypedDict -from ..shared_params.url import URL from ..shared_params.user_message import UserMessage from ..shared_params.tool_response_message import ToolResponseMessage from ..shared_params.interleaved_content_item import InterleavedContentItem @@ -17,7 +16,9 @@ "DocumentContent", "DocumentContentImageContentItem", "DocumentContentImageContentItemImage", + "DocumentContentImageContentItemImageURL", "DocumentContentTextContentItem", + "DocumentContentURL", "ToolConfig", "Toolgroup", "ToolgroupUnionMember1", @@ -42,11 +43,15 @@ class TurnCreateParamsBase(TypedDict, total=False): Message: TypeAlias = Union[UserMessage, ToolResponseMessage] +class DocumentContentImageContentItemImageURL(TypedDict, total=False): + uri: Required[str] + + class DocumentContentImageContentItemImage(TypedDict, total=False): data: str """base64 encoded image data as string""" - url: URL + url: DocumentContentImageContentItemImageURL """A URL of the image or data URL in the format of data:image/{type};base64,{data}. Note that URL could have length limits. @@ -69,8 +74,16 @@ class DocumentContentTextContentItem(TypedDict, total=False): """Discriminator type of the content item. Always "text" """ +class DocumentContentURL(TypedDict, total=False): + uri: Required[str] + + DocumentContent: TypeAlias = Union[ - str, DocumentContentImageContentItem, DocumentContentTextContentItem, Iterable[InterleavedContentItem], URL + str, + DocumentContentImageContentItem, + DocumentContentTextContentItem, + Iterable[InterleavedContentItem], + DocumentContentURL, ] diff --git a/src/llama_stack_client/types/benchmark.py b/src/llama_stack_client/types/benchmark.py new file mode 100644 index 00000000..3af66f6a --- /dev/null +++ b/src/llama_stack_client/types/benchmark.py @@ -0,0 +1,24 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing import Dict, List, Union +from typing_extensions import Literal + +from .._models import BaseModel + +__all__ = ["Benchmark"] + + +class Benchmark(BaseModel): + dataset_id: str + + identifier: str + + metadata: Dict[str, Union[bool, float, str, List[object], object, None]] + + provider_id: str + + provider_resource_id: str + + scoring_functions: List[str] + + type: Literal["benchmark"] diff --git a/src/llama_stack_client/types/benchmark_config_param.py b/src/llama_stack_client/types/benchmark_config_param.py new file mode 100644 index 00000000..48090c5f --- /dev/null +++ b/src/llama_stack_client/types/benchmark_config_param.py @@ -0,0 +1,21 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
+ +from __future__ import annotations + +from typing import Dict +from typing_extensions import Literal, Required, TypedDict + +from .eval_candidate_param import EvalCandidateParam +from .scoring_fn_params_param import ScoringFnParamsParam + +__all__ = ["BenchmarkConfigParam"] + + +class BenchmarkConfigParam(TypedDict, total=False): + eval_candidate: Required[EvalCandidateParam] + + scoring_params: Required[Dict[str, ScoringFnParamsParam]] + + type: Required[Literal["benchmark"]] + + num_examples: int diff --git a/src/llama_stack_client/types/benchmark_list_response.py b/src/llama_stack_client/types/benchmark_list_response.py new file mode 100644 index 00000000..b2e8ad2b --- /dev/null +++ b/src/llama_stack_client/types/benchmark_list_response.py @@ -0,0 +1,10 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing import List +from typing_extensions import TypeAlias + +from .benchmark import Benchmark + +__all__ = ["BenchmarkListResponse"] + +BenchmarkListResponse: TypeAlias = List[Benchmark] diff --git a/src/llama_stack_client/types/benchmark_register_params.py b/src/llama_stack_client/types/benchmark_register_params.py new file mode 100644 index 00000000..def970a1 --- /dev/null +++ b/src/llama_stack_client/types/benchmark_register_params.py @@ -0,0 +1,22 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from __future__ import annotations + +from typing import Dict, List, Union, Iterable +from typing_extensions import Required, TypedDict + +__all__ = ["BenchmarkRegisterParams"] + + +class BenchmarkRegisterParams(TypedDict, total=False): + benchmark_id: Required[str] + + dataset_id: Required[str] + + scoring_functions: Required[List[str]] + + metadata: Dict[str, Union[bool, float, str, Iterable[object], object, None]] + + provider_benchmark_id: str + + provider_id: str diff --git a/src/llama_stack_client/types/chat_completion_response_stream_chunk.py b/src/llama_stack_client/types/chat_completion_response_stream_chunk.py index 99916add..f032c233 100644 --- a/src/llama_stack_client/types/chat_completion_response_stream_chunk.py +++ b/src/llama_stack_client/types/chat_completion_response_stream_chunk.py @@ -1,13 +1,14 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
-from typing import List, Optional +from typing import Dict, List, Union, Optional +from datetime import datetime from typing_extensions import Literal from .._models import BaseModel from .token_log_probs import TokenLogProbs from .shared.content_delta import ContentDelta -__all__ = ["ChatCompletionResponseStreamChunk", "Event"] +__all__ = ["ChatCompletionResponseStreamChunk", "Event", "Metric"] class Event(BaseModel): @@ -27,6 +28,26 @@ class Event(BaseModel): """Optional reason why generation stopped, if complete""" +class Metric(BaseModel): + metric: str + + span_id: str + + timestamp: datetime + + trace_id: str + + type: Literal["metric"] + + unit: str + + value: float + + attributes: Optional[Dict[str, Union[str, float, bool, None]]] = None + + class ChatCompletionResponseStreamChunk(BaseModel): event: Event """The event containing the new content""" + + metrics: Optional[List[Metric]] = None diff --git a/src/llama_stack_client/types/dataset_list_response.py b/src/llama_stack_client/types/dataset_list_response.py index 0051669b..1dc2afa4 100644 --- a/src/llama_stack_client/types/dataset_list_response.py +++ b/src/llama_stack_client/types/dataset_list_response.py @@ -4,10 +4,13 @@ from typing_extensions import Literal, TypeAlias from .._models import BaseModel -from .shared.url import URL from .shared.param_type import ParamType -__all__ = ["DatasetListResponse", "DatasetListResponseItem"] +__all__ = ["DatasetListResponse", "DatasetListResponseItem", "DatasetListResponseItemURL"] + + +class DatasetListResponseItemURL(BaseModel): + uri: str class DatasetListResponseItem(BaseModel): @@ -23,7 +26,7 @@ class DatasetListResponseItem(BaseModel): type: Literal["dataset"] - url: URL + url: DatasetListResponseItemURL DatasetListResponse: TypeAlias = List[DatasetListResponseItem] diff --git a/src/llama_stack_client/types/dataset_register_params.py b/src/llama_stack_client/types/dataset_register_params.py index 853485a6..1c1cf234 100644 --- a/src/llama_stack_client/types/dataset_register_params.py +++ b/src/llama_stack_client/types/dataset_register_params.py @@ -5,10 +5,9 @@ from typing import Dict, Union, Iterable from typing_extensions import Required, TypedDict -from .shared_params.url import URL from .shared_params.param_type import ParamType -__all__ = ["DatasetRegisterParams"] +__all__ = ["DatasetRegisterParams", "URL"] class DatasetRegisterParams(TypedDict, total=False): @@ -23,3 +22,7 @@ class DatasetRegisterParams(TypedDict, total=False): provider_dataset_id: str provider_id: str + + +class URL(TypedDict, total=False): + uri: Required[str] diff --git a/src/llama_stack_client/types/dataset_retrieve_response.py b/src/llama_stack_client/types/dataset_retrieve_response.py index 31d7ab33..bd819a56 100644 --- a/src/llama_stack_client/types/dataset_retrieve_response.py +++ b/src/llama_stack_client/types/dataset_retrieve_response.py @@ -4,10 +4,13 @@ from typing_extensions import Literal from .._models import BaseModel -from .shared.url import URL from .shared.param_type import ParamType -__all__ = ["DatasetRetrieveResponse"] +__all__ = ["DatasetRetrieveResponse", "URL"] + + +class URL(BaseModel): + uri: str class DatasetRetrieveResponse(BaseModel): diff --git a/src/llama_stack_client/types/eval_evaluate_rows_alpha_params.py b/src/llama_stack_client/types/eval_evaluate_rows_alpha_params.py new file mode 100644 index 00000000..9758e814 --- /dev/null +++ b/src/llama_stack_client/types/eval_evaluate_rows_alpha_params.py @@ -0,0 +1,18 @@ +# File generated from our OpenAPI spec by Stainless. 
See CONTRIBUTING.md for details. + +from __future__ import annotations + +from typing import Dict, List, Union, Iterable +from typing_extensions import Required, TypedDict + +from .benchmark_config_param import BenchmarkConfigParam + +__all__ = ["EvalEvaluateRowsAlphaParams"] + + +class EvalEvaluateRowsAlphaParams(TypedDict, total=False): + input_rows: Required[Iterable[Dict[str, Union[bool, float, str, Iterable[object], object, None]]]] + + scoring_functions: Required[List[str]] + + task_config: Required[BenchmarkConfigParam] diff --git a/src/llama_stack_client/types/eval_evaluate_rows_params.py b/src/llama_stack_client/types/eval_evaluate_rows_params.py index 065764b5..86cdde00 100644 --- a/src/llama_stack_client/types/eval_evaluate_rows_params.py +++ b/src/llama_stack_client/types/eval_evaluate_rows_params.py @@ -5,7 +5,7 @@ from typing import Dict, List, Union, Iterable from typing_extensions import Required, TypedDict -from .eval_task_config_param import EvalTaskConfigParam +from .benchmark_config_param import BenchmarkConfigParam __all__ = ["EvalEvaluateRowsParams"] @@ -15,4 +15,4 @@ class EvalEvaluateRowsParams(TypedDict, total=False): scoring_functions: Required[List[str]] - task_config: Required[EvalTaskConfigParam] + task_config: Required[BenchmarkConfigParam] diff --git a/src/llama_stack_client/types/eval_run_eval_alpha_params.py b/src/llama_stack_client/types/eval_run_eval_alpha_params.py new file mode 100644 index 00000000..3ca2521a --- /dev/null +++ b/src/llama_stack_client/types/eval_run_eval_alpha_params.py @@ -0,0 +1,13 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from __future__ import annotations + +from typing_extensions import Required, TypedDict + +from .benchmark_config_param import BenchmarkConfigParam + +__all__ = ["EvalRunEvalAlphaParams"] + + +class EvalRunEvalAlphaParams(TypedDict, total=False): + task_config: Required[BenchmarkConfigParam] diff --git a/src/llama_stack_client/types/eval_run_eval_params.py b/src/llama_stack_client/types/eval_run_eval_params.py index 9ee91af8..a5715f29 100644 --- a/src/llama_stack_client/types/eval_run_eval_params.py +++ b/src/llama_stack_client/types/eval_run_eval_params.py @@ -4,10 +4,10 @@ from typing_extensions import Required, TypedDict -from .eval_task_config_param import EvalTaskConfigParam +from .benchmark_config_param import BenchmarkConfigParam __all__ = ["EvalRunEvalParams"] class EvalRunEvalParams(TypedDict, total=False): - task_config: Required[EvalTaskConfigParam] + task_config: Required[BenchmarkConfigParam] diff --git a/src/llama_stack_client/types/eval_task_list_response.py b/src/llama_stack_client/types/eval_task_list_response.py index 11646563..c1260673 100644 --- a/src/llama_stack_client/types/eval_task_list_response.py +++ b/src/llama_stack_client/types/eval_task_list_response.py @@ -3,8 +3,8 @@ from typing import List from typing_extensions import TypeAlias -from .eval_task import EvalTask +from .benchmark import Benchmark __all__ = ["EvalTaskListResponse"] -EvalTaskListResponse: TypeAlias = List[EvalTask] +EvalTaskListResponse: TypeAlias = List[Benchmark] diff --git a/src/llama_stack_client/types/eval_task_register_params.py b/src/llama_stack_client/types/eval_task_register_params.py index 417bc2cd..26934c67 100644 --- a/src/llama_stack_client/types/eval_task_register_params.py +++ b/src/llama_stack_client/types/eval_task_register_params.py @@ -17,6 +17,6 @@ class EvalTaskRegisterParams(TypedDict, total=False): metadata: Dict[str, Union[bool, float, str, 
Iterable[object], object, None]] - provider_eval_task_id: str + provider_benchmark_id: str provider_id: str diff --git a/src/llama_stack_client/types/event_param.py b/src/llama_stack_client/types/event_param.py index 7505d6f7..500e4a24 100644 --- a/src/llama_stack_client/types/event_param.py +++ b/src/llama_stack_client/types/event_param.py @@ -2,7 +2,7 @@ from __future__ import annotations -from typing import Dict, Union, Iterable +from typing import Dict, Union from datetime import datetime from typing_extensions import Literal, Required, Annotated, TypeAlias, TypedDict @@ -32,7 +32,7 @@ class UnstructuredLogEvent(TypedDict, total=False): type: Required[Literal["unstructured_log"]] - attributes: Dict[str, Union[bool, float, str, Iterable[object], object, None]] + attributes: Dict[str, Union[str, float, bool, None]] class MetricEvent(TypedDict, total=False): @@ -50,7 +50,7 @@ class MetricEvent(TypedDict, total=False): value: Required[float] - attributes: Dict[str, Union[bool, float, str, Iterable[object], object, None]] + attributes: Dict[str, Union[str, float, bool, None]] class StructuredLogEventPayloadSpanStartPayload(TypedDict, total=False): @@ -83,7 +83,7 @@ class StructuredLogEvent(TypedDict, total=False): type: Required[Literal["structured_log"]] - attributes: Dict[str, Union[bool, float, str, Iterable[object], object, None]] + attributes: Dict[str, Union[str, float, bool, None]] EventParam: TypeAlias = Union[UnstructuredLogEvent, MetricEvent, StructuredLogEvent] diff --git a/src/llama_stack_client/types/list_benchmarks_response.py b/src/llama_stack_client/types/list_benchmarks_response.py new file mode 100644 index 00000000..4185f3d1 --- /dev/null +++ b/src/llama_stack_client/types/list_benchmarks_response.py @@ -0,0 +1,11 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + + +from .._models import BaseModel +from .benchmark_list_response import BenchmarkListResponse + +__all__ = ["ListBenchmarksResponse"] + + +class ListBenchmarksResponse(BaseModel): + data: BenchmarkListResponse diff --git a/src/llama_stack_client/types/shared/__init__.py b/src/llama_stack_client/types/shared/__init__.py index 075a664d..0fe46810 100644 --- a/src/llama_stack_client/types/shared/__init__.py +++ b/src/llama_stack_client/types/shared/__init__.py @@ -1,6 +1,5 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. -from .url import URL as URL from .message import Message as Message from .document import Document as Document from .tool_call import ToolCall as ToolCall @@ -19,6 +18,7 @@ from .safety_violation import SafetyViolation as SafetyViolation from .completion_message import CompletionMessage as CompletionMessage from .interleaved_content import InterleavedContent as InterleavedContent +from .tool_call_or_string import ToolCallOrString as ToolCallOrString from .tool_param_definition import ToolParamDefinition as ToolParamDefinition from .tool_response_message import ToolResponseMessage as ToolResponseMessage from .query_generator_config import QueryGeneratorConfig as QueryGeneratorConfig diff --git a/src/llama_stack_client/types/shared/chat_completion_response.py b/src/llama_stack_client/types/shared/chat_completion_response.py index e8c5071e..2d0002a9 100644 --- a/src/llama_stack_client/types/shared/chat_completion_response.py +++ b/src/llama_stack_client/types/shared/chat_completion_response.py @@ -1,12 +1,32 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
-from typing import List, Optional +from typing import Dict, List, Union, Optional +from datetime import datetime +from typing_extensions import Literal from ..._models import BaseModel from ..token_log_probs import TokenLogProbs from .completion_message import CompletionMessage -__all__ = ["ChatCompletionResponse"] +__all__ = ["ChatCompletionResponse", "Metric"] + + +class Metric(BaseModel): + metric: str + + span_id: str + + timestamp: datetime + + trace_id: str + + type: Literal["metric"] + + unit: str + + value: float + + attributes: Optional[Dict[str, Union[str, float, bool, None]]] = None class ChatCompletionResponse(BaseModel): @@ -15,3 +35,5 @@ class ChatCompletionResponse(BaseModel): logprobs: Optional[List[TokenLogProbs]] = None """Optional log probabilities for generated tokens""" + + metrics: Optional[List[Metric]] = None diff --git a/src/llama_stack_client/types/shared/content_delta.py b/src/llama_stack_client/types/shared/content_delta.py index 18207c75..ae036ad8 100644 --- a/src/llama_stack_client/types/shared/content_delta.py +++ b/src/llama_stack_client/types/shared/content_delta.py @@ -5,9 +5,9 @@ from ..._utils import PropertyInfo from ..._models import BaseModel -from .tool_call import ToolCall +from .tool_call_or_string import ToolCallOrString -__all__ = ["ContentDelta", "TextDelta", "ImageDelta", "ToolCallDelta", "ToolCallDeltaToolCall"] +__all__ = ["ContentDelta", "TextDelta", "ImageDelta", "ToolCallDelta"] class TextDelta(BaseModel): @@ -22,13 +22,10 @@ class ImageDelta(BaseModel): type: Literal["image"] -ToolCallDeltaToolCall: TypeAlias = Union[str, ToolCall] - - class ToolCallDelta(BaseModel): parse_status: Literal["started", "in_progress", "failed", "succeeded"] - tool_call: ToolCallDeltaToolCall + tool_call: ToolCallOrString type: Literal["tool_call"] diff --git a/src/llama_stack_client/types/shared/document.py b/src/llama_stack_client/types/shared/document.py index 1282bd0a..b9bfa898 100644 --- a/src/llama_stack_client/types/shared/document.py +++ b/src/llama_stack_client/types/shared/document.py @@ -3,18 +3,29 @@ from typing import Dict, List, Union, Optional from typing_extensions import Literal, TypeAlias -from .url import URL from ..._models import BaseModel from .interleaved_content_item import InterleavedContentItem -__all__ = ["Document", "Content", "ContentImageContentItem", "ContentImageContentItemImage", "ContentTextContentItem"] +__all__ = [ + "Document", + "Content", + "ContentImageContentItem", + "ContentImageContentItemImage", + "ContentImageContentItemImageURL", + "ContentTextContentItem", + "ContentURL", +] + + +class ContentImageContentItemImageURL(BaseModel): + uri: str class ContentImageContentItemImage(BaseModel): data: Optional[str] = None """base64 encoded image data as string""" - url: Optional[URL] = None + url: Optional[ContentImageContentItemImageURL] = None """A URL of the image or data URL in the format of data:image/{type};base64,{data}. Note that URL could have length limits. @@ -37,7 +48,13 @@ class ContentTextContentItem(BaseModel): """Discriminator type of the content item. 
Always "text" """ -Content: TypeAlias = Union[str, ContentImageContentItem, ContentTextContentItem, List[InterleavedContentItem], URL] +class ContentURL(BaseModel): + uri: str + + +Content: TypeAlias = Union[ + str, ContentImageContentItem, ContentTextContentItem, List[InterleavedContentItem], ContentURL +] class Document(BaseModel): diff --git a/src/llama_stack_client/types/shared/interleaved_content.py b/src/llama_stack_client/types/shared/interleaved_content.py index 02a9b43e..dc496150 100644 --- a/src/llama_stack_client/types/shared/interleaved_content.py +++ b/src/llama_stack_client/types/shared/interleaved_content.py @@ -3,18 +3,27 @@ from typing import List, Union, Optional from typing_extensions import Literal, TypeAlias -from .url import URL from ..._models import BaseModel from .interleaved_content_item import InterleavedContentItem -__all__ = ["InterleavedContent", "ImageContentItem", "ImageContentItemImage", "TextContentItem"] +__all__ = [ + "InterleavedContent", + "ImageContentItem", + "ImageContentItemImage", + "ImageContentItemImageURL", + "TextContentItem", +] + + +class ImageContentItemImageURL(BaseModel): + uri: str class ImageContentItemImage(BaseModel): data: Optional[str] = None """base64 encoded image data as string""" - url: Optional[URL] = None + url: Optional[ImageContentItemImageURL] = None """A URL of the image or data URL in the format of data:image/{type};base64,{data}. Note that URL could have length limits. diff --git a/src/llama_stack_client/types/shared/interleaved_content_item.py b/src/llama_stack_client/types/shared/interleaved_content_item.py index c7030b1c..8a3238b8 100644 --- a/src/llama_stack_client/types/shared/interleaved_content_item.py +++ b/src/llama_stack_client/types/shared/interleaved_content_item.py @@ -3,18 +3,27 @@ from typing import Union, Optional from typing_extensions import Literal, Annotated, TypeAlias -from .url import URL from ..._utils import PropertyInfo from ..._models import BaseModel -__all__ = ["InterleavedContentItem", "ImageContentItem", "ImageContentItemImage", "TextContentItem"] +__all__ = [ + "InterleavedContentItem", + "ImageContentItem", + "ImageContentItemImage", + "ImageContentItemImageURL", + "TextContentItem", +] + + +class ImageContentItemImageURL(BaseModel): + uri: str class ImageContentItemImage(BaseModel): data: Optional[str] = None """base64 encoded image data as string""" - url: Optional[URL] = None + url: Optional[ImageContentItemImageURL] = None """A URL of the image or data URL in the format of data:image/{type};base64,{data}. Note that URL could have length limits. diff --git a/src/llama_stack_client/types/shared/tool_call_or_string.py b/src/llama_stack_client/types/shared/tool_call_or_string.py new file mode 100644 index 00000000..f52a0d98 --- /dev/null +++ b/src/llama_stack_client/types/shared/tool_call_or_string.py @@ -0,0 +1,10 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing import Union +from typing_extensions import TypeAlias + +from .tool_call import ToolCall + +__all__ = ["ToolCallOrString"] + +ToolCallOrString: TypeAlias = Union[str, ToolCall] diff --git a/src/llama_stack_client/types/shared_params/__init__.py b/src/llama_stack_client/types/shared_params/__init__.py index 8c2041a6..d647c238 100644 --- a/src/llama_stack_client/types/shared_params/__init__.py +++ b/src/llama_stack_client/types/shared_params/__init__.py @@ -1,6 +1,5 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
-from .url import URL as URL from .message import Message as Message from .document import Document as Document from .tool_call import ToolCall as ToolCall diff --git a/src/llama_stack_client/types/shared_params/document.py b/src/llama_stack_client/types/shared_params/document.py index fd464554..fd3c3df1 100644 --- a/src/llama_stack_client/types/shared_params/document.py +++ b/src/llama_stack_client/types/shared_params/document.py @@ -5,17 +5,28 @@ from typing import Dict, Union, Iterable from typing_extensions import Literal, Required, TypeAlias, TypedDict -from .url import URL from .interleaved_content_item import InterleavedContentItem -__all__ = ["Document", "Content", "ContentImageContentItem", "ContentImageContentItemImage", "ContentTextContentItem"] +__all__ = [ + "Document", + "Content", + "ContentImageContentItem", + "ContentImageContentItemImage", + "ContentImageContentItemImageURL", + "ContentTextContentItem", + "ContentURL", +] + + +class ContentImageContentItemImageURL(TypedDict, total=False): + uri: Required[str] class ContentImageContentItemImage(TypedDict, total=False): data: str """base64 encoded image data as string""" - url: URL + url: ContentImageContentItemImageURL """A URL of the image or data URL in the format of data:image/{type};base64,{data}. Note that URL could have length limits. @@ -38,7 +49,13 @@ class ContentTextContentItem(TypedDict, total=False): """Discriminator type of the content item. Always "text" """ -Content: TypeAlias = Union[str, ContentImageContentItem, ContentTextContentItem, Iterable[InterleavedContentItem], URL] +class ContentURL(TypedDict, total=False): + uri: Required[str] + + +Content: TypeAlias = Union[ + str, ContentImageContentItem, ContentTextContentItem, Iterable[InterleavedContentItem], ContentURL +] class Document(TypedDict, total=False): diff --git a/src/llama_stack_client/types/shared_params/interleaved_content.py b/src/llama_stack_client/types/shared_params/interleaved_content.py index 8d5605fb..5d045a20 100644 --- a/src/llama_stack_client/types/shared_params/interleaved_content.py +++ b/src/llama_stack_client/types/shared_params/interleaved_content.py @@ -5,17 +5,26 @@ from typing import Union, Iterable from typing_extensions import Literal, Required, TypeAlias, TypedDict -from .url import URL from .interleaved_content_item import InterleavedContentItem -__all__ = ["InterleavedContent", "ImageContentItem", "ImageContentItemImage", "TextContentItem"] +__all__ = [ + "InterleavedContent", + "ImageContentItem", + "ImageContentItemImage", + "ImageContentItemImageURL", + "TextContentItem", +] + + +class ImageContentItemImageURL(TypedDict, total=False): + uri: Required[str] class ImageContentItemImage(TypedDict, total=False): data: str """base64 encoded image data as string""" - url: URL + url: ImageContentItemImageURL """A URL of the image or data URL in the format of data:image/{type};base64,{data}. Note that URL could have length limits. 
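
Note on usage (not part of the generated diff): with the shared URL model removed, request and response types now carry URLs as inline {"uri": ...} objects. A minimal sketch of passing an image by URL with the updated shape; the server address, model id, and image location are placeholders, and the call assumes an inference provider is configured on the server:

    from llama_stack_client import LlamaStackClient

    client = LlamaStackClient(base_url="http://localhost:5001")  # placeholder server address

    response = client.inference.chat_completion(
        model_id="meta-llama/Llama-3.2-11B-Vision-Instruct",  # placeholder model id
        messages=[
            {
                "role": "user",
                "content": [
                    # The image URL is now a plain {"uri": ...} dict rather than the removed shared URL type.
                    {"type": "image", "image": {"url": {"uri": "https://example.com/cat.png"}}},
                    {"type": "text", "text": "Describe this image."},
                ],
            }
        ],
    )
    print(response.completion_message.content)

The new benchmarks resource and the benchmark-flavored eval config added in this change can be exercised the same way; a sketch of registering a benchmark and starting an eval run, where all identifiers, the judge model, and the candidate model are placeholders rather than values taken from this diff:

    # Register a benchmark over an existing dataset (ids and scoring function names are placeholders).
    client.benchmarks.register(
        benchmark_id="my-benchmark",
        dataset_id="my-dataset",
        scoring_functions=["my-scoring-fn"],
        metadata={"owner": "eval-team"},
    )

    # Each entry returned by list() is a Benchmark with type == "benchmark".
    for benchmark in client.benchmarks.list():
        print(benchmark.identifier, benchmark.dataset_id, benchmark.scoring_functions)

    # Kick off an eval; scoring_params is now a required part of the benchmark config.
    job = client.eval.run_eval_alpha(
        benchmark_id="my-benchmark",
        task_config={
            "type": "benchmark",
            "eval_candidate": {
                "type": "model",
                "model": "meta-llama/Llama-3.1-8B-Instruct",  # placeholder candidate model
                "sampling_params": {"strategy": {"type": "greedy"}},
            },
            "scoring_params": {
                "my-scoring-fn": {
                    "type": "llm_as_judge",
                    "judge_model": "meta-llama/Llama-3.1-8B-Instruct",  # placeholder judge model
                }
            },
        },
    )
    # Poll the job using the renamed benchmark_id path parameter (assumes Job exposes job_id).
    print(client.eval.jobs.status(job_id=job.job_id, benchmark_id="my-benchmark"))
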
diff --git a/src/llama_stack_client/types/shared_params/interleaved_content_item.py b/src/llama_stack_client/types/shared_params/interleaved_content_item.py index acb7e6f1..b5c0bcc1 100644 --- a/src/llama_stack_client/types/shared_params/interleaved_content_item.py +++ b/src/llama_stack_client/types/shared_params/interleaved_content_item.py @@ -5,16 +5,24 @@ from typing import Union from typing_extensions import Literal, Required, TypeAlias, TypedDict -from .url import URL +__all__ = [ + "InterleavedContentItem", + "ImageContentItem", + "ImageContentItemImage", + "ImageContentItemImageURL", + "TextContentItem", +] -__all__ = ["InterleavedContentItem", "ImageContentItem", "ImageContentItemImage", "TextContentItem"] + +class ImageContentItemImageURL(TypedDict, total=False): + uri: Required[str] class ImageContentItemImage(TypedDict, total=False): data: str """base64 encoded image data as string""" - url: URL + url: ImageContentItemImageURL """A URL of the image or data URL in the format of data:image/{type};base64,{data}. Note that URL could have length limits. diff --git a/src/llama_stack_client/types/tool_group.py b/src/llama_stack_client/types/tool_group.py index 82d2e057..480d1942 100644 --- a/src/llama_stack_client/types/tool_group.py +++ b/src/llama_stack_client/types/tool_group.py @@ -4,9 +4,12 @@ from typing_extensions import Literal from .._models import BaseModel -from .shared.url import URL -__all__ = ["ToolGroup"] +__all__ = ["ToolGroup", "McpEndpoint"] + + +class McpEndpoint(BaseModel): + uri: str class ToolGroup(BaseModel): @@ -20,4 +23,4 @@ class ToolGroup(BaseModel): args: Optional[Dict[str, Union[bool, float, str, List[object], object, None]]] = None - mcp_endpoint: Optional[URL] = None + mcp_endpoint: Optional[McpEndpoint] = None diff --git a/src/llama_stack_client/types/tool_runtime_list_tools_params.py b/src/llama_stack_client/types/tool_runtime_list_tools_params.py index 7db74244..99da7533 100644 --- a/src/llama_stack_client/types/tool_runtime_list_tools_params.py +++ b/src/llama_stack_client/types/tool_runtime_list_tools_params.py @@ -2,14 +2,16 @@ from __future__ import annotations -from typing_extensions import TypedDict +from typing_extensions import Required, TypedDict -from .shared_params.url import URL - -__all__ = ["ToolRuntimeListToolsParams"] +__all__ = ["ToolRuntimeListToolsParams", "McpEndpoint"] class ToolRuntimeListToolsParams(TypedDict, total=False): - mcp_endpoint: URL + mcp_endpoint: McpEndpoint tool_group_id: str + + +class McpEndpoint(TypedDict, total=False): + uri: Required[str] diff --git a/src/llama_stack_client/types/toolgroup_register_params.py b/src/llama_stack_client/types/toolgroup_register_params.py index 1184be85..8cb7af7f 100644 --- a/src/llama_stack_client/types/toolgroup_register_params.py +++ b/src/llama_stack_client/types/toolgroup_register_params.py @@ -5,9 +5,7 @@ from typing import Dict, Union, Iterable from typing_extensions import Required, TypedDict -from .shared_params.url import URL - -__all__ = ["ToolgroupRegisterParams"] +__all__ = ["ToolgroupRegisterParams", "McpEndpoint"] class ToolgroupRegisterParams(TypedDict, total=False): @@ -17,4 +15,8 @@ class ToolgroupRegisterParams(TypedDict, total=False): args: Dict[str, Union[bool, float, str, Iterable[object], object, None]] - mcp_endpoint: URL + mcp_endpoint: McpEndpoint + + +class McpEndpoint(TypedDict, total=False): + uri: Required[str] diff --git a/tests/api_resources/eval/test_jobs.py b/tests/api_resources/eval/test_jobs.py index beb290a0..f9b85a08 100644 --- 
a/tests/api_resources/eval/test_jobs.py +++ b/tests/api_resources/eval/test_jobs.py @@ -22,7 +22,7 @@ class TestJobs: def test_method_retrieve(self, client: LlamaStackClient) -> None: job = client.eval.jobs.retrieve( job_id="job_id", - task_id="task_id", + benchmark_id="benchmark_id", ) assert_matches_type(EvaluateResponse, job, path=["response"]) @@ -30,7 +30,7 @@ def test_method_retrieve(self, client: LlamaStackClient) -> None: def test_raw_response_retrieve(self, client: LlamaStackClient) -> None: response = client.eval.jobs.with_raw_response.retrieve( job_id="job_id", - task_id="task_id", + benchmark_id="benchmark_id", ) assert response.is_closed is True @@ -42,7 +42,7 @@ def test_raw_response_retrieve(self, client: LlamaStackClient) -> None: def test_streaming_response_retrieve(self, client: LlamaStackClient) -> None: with client.eval.jobs.with_streaming_response.retrieve( job_id="job_id", - task_id="task_id", + benchmark_id="benchmark_id", ) as response: assert not response.is_closed assert response.http_request.headers.get("X-Stainless-Lang") == "python" @@ -54,23 +54,23 @@ def test_streaming_response_retrieve(self, client: LlamaStackClient) -> None: @parametrize def test_path_params_retrieve(self, client: LlamaStackClient) -> None: - with pytest.raises(ValueError, match=r"Expected a non-empty value for `task_id` but received ''"): + with pytest.raises(ValueError, match=r"Expected a non-empty value for `benchmark_id` but received ''"): client.eval.jobs.with_raw_response.retrieve( job_id="job_id", - task_id="", + benchmark_id="", ) with pytest.raises(ValueError, match=r"Expected a non-empty value for `job_id` but received ''"): client.eval.jobs.with_raw_response.retrieve( job_id="", - task_id="task_id", + benchmark_id="benchmark_id", ) @parametrize def test_method_cancel(self, client: LlamaStackClient) -> None: job = client.eval.jobs.cancel( job_id="job_id", - task_id="task_id", + benchmark_id="benchmark_id", ) assert job is None @@ -78,7 +78,7 @@ def test_method_cancel(self, client: LlamaStackClient) -> None: def test_raw_response_cancel(self, client: LlamaStackClient) -> None: response = client.eval.jobs.with_raw_response.cancel( job_id="job_id", - task_id="task_id", + benchmark_id="benchmark_id", ) assert response.is_closed is True @@ -90,7 +90,7 @@ def test_raw_response_cancel(self, client: LlamaStackClient) -> None: def test_streaming_response_cancel(self, client: LlamaStackClient) -> None: with client.eval.jobs.with_streaming_response.cancel( job_id="job_id", - task_id="task_id", + benchmark_id="benchmark_id", ) as response: assert not response.is_closed assert response.http_request.headers.get("X-Stainless-Lang") == "python" @@ -102,23 +102,23 @@ def test_streaming_response_cancel(self, client: LlamaStackClient) -> None: @parametrize def test_path_params_cancel(self, client: LlamaStackClient) -> None: - with pytest.raises(ValueError, match=r"Expected a non-empty value for `task_id` but received ''"): + with pytest.raises(ValueError, match=r"Expected a non-empty value for `benchmark_id` but received ''"): client.eval.jobs.with_raw_response.cancel( job_id="job_id", - task_id="", + benchmark_id="", ) with pytest.raises(ValueError, match=r"Expected a non-empty value for `job_id` but received ''"): client.eval.jobs.with_raw_response.cancel( job_id="", - task_id="task_id", + benchmark_id="benchmark_id", ) @parametrize def test_method_status(self, client: LlamaStackClient) -> None: job = client.eval.jobs.status( job_id="job_id", - task_id="task_id", + benchmark_id="benchmark_id", ) 
assert_matches_type(Optional[JobStatusResponse], job, path=["response"]) @@ -126,7 +126,7 @@ def test_method_status(self, client: LlamaStackClient) -> None: def test_raw_response_status(self, client: LlamaStackClient) -> None: response = client.eval.jobs.with_raw_response.status( job_id="job_id", - task_id="task_id", + benchmark_id="benchmark_id", ) assert response.is_closed is True @@ -138,7 +138,7 @@ def test_raw_response_status(self, client: LlamaStackClient) -> None: def test_streaming_response_status(self, client: LlamaStackClient) -> None: with client.eval.jobs.with_streaming_response.status( job_id="job_id", - task_id="task_id", + benchmark_id="benchmark_id", ) as response: assert not response.is_closed assert response.http_request.headers.get("X-Stainless-Lang") == "python" @@ -150,16 +150,16 @@ def test_streaming_response_status(self, client: LlamaStackClient) -> None: @parametrize def test_path_params_status(self, client: LlamaStackClient) -> None: - with pytest.raises(ValueError, match=r"Expected a non-empty value for `task_id` but received ''"): + with pytest.raises(ValueError, match=r"Expected a non-empty value for `benchmark_id` but received ''"): client.eval.jobs.with_raw_response.status( job_id="job_id", - task_id="", + benchmark_id="", ) with pytest.raises(ValueError, match=r"Expected a non-empty value for `job_id` but received ''"): client.eval.jobs.with_raw_response.status( job_id="", - task_id="task_id", + benchmark_id="benchmark_id", ) @@ -170,7 +170,7 @@ class TestAsyncJobs: async def test_method_retrieve(self, async_client: AsyncLlamaStackClient) -> None: job = await async_client.eval.jobs.retrieve( job_id="job_id", - task_id="task_id", + benchmark_id="benchmark_id", ) assert_matches_type(EvaluateResponse, job, path=["response"]) @@ -178,7 +178,7 @@ async def test_method_retrieve(self, async_client: AsyncLlamaStackClient) -> Non async def test_raw_response_retrieve(self, async_client: AsyncLlamaStackClient) -> None: response = await async_client.eval.jobs.with_raw_response.retrieve( job_id="job_id", - task_id="task_id", + benchmark_id="benchmark_id", ) assert response.is_closed is True @@ -190,7 +190,7 @@ async def test_raw_response_retrieve(self, async_client: AsyncLlamaStackClient) async def test_streaming_response_retrieve(self, async_client: AsyncLlamaStackClient) -> None: async with async_client.eval.jobs.with_streaming_response.retrieve( job_id="job_id", - task_id="task_id", + benchmark_id="benchmark_id", ) as response: assert not response.is_closed assert response.http_request.headers.get("X-Stainless-Lang") == "python" @@ -202,23 +202,23 @@ async def test_streaming_response_retrieve(self, async_client: AsyncLlamaStackCl @parametrize async def test_path_params_retrieve(self, async_client: AsyncLlamaStackClient) -> None: - with pytest.raises(ValueError, match=r"Expected a non-empty value for `task_id` but received ''"): + with pytest.raises(ValueError, match=r"Expected a non-empty value for `benchmark_id` but received ''"): await async_client.eval.jobs.with_raw_response.retrieve( job_id="job_id", - task_id="", + benchmark_id="", ) with pytest.raises(ValueError, match=r"Expected a non-empty value for `job_id` but received ''"): await async_client.eval.jobs.with_raw_response.retrieve( job_id="", - task_id="task_id", + benchmark_id="benchmark_id", ) @parametrize async def test_method_cancel(self, async_client: AsyncLlamaStackClient) -> None: job = await async_client.eval.jobs.cancel( job_id="job_id", - task_id="task_id", + benchmark_id="benchmark_id", ) assert job 
is None @@ -226,7 +226,7 @@ async def test_method_cancel(self, async_client: AsyncLlamaStackClient) -> None: async def test_raw_response_cancel(self, async_client: AsyncLlamaStackClient) -> None: response = await async_client.eval.jobs.with_raw_response.cancel( job_id="job_id", - task_id="task_id", + benchmark_id="benchmark_id", ) assert response.is_closed is True @@ -238,7 +238,7 @@ async def test_raw_response_cancel(self, async_client: AsyncLlamaStackClient) -> async def test_streaming_response_cancel(self, async_client: AsyncLlamaStackClient) -> None: async with async_client.eval.jobs.with_streaming_response.cancel( job_id="job_id", - task_id="task_id", + benchmark_id="benchmark_id", ) as response: assert not response.is_closed assert response.http_request.headers.get("X-Stainless-Lang") == "python" @@ -250,23 +250,23 @@ async def test_streaming_response_cancel(self, async_client: AsyncLlamaStackClie @parametrize async def test_path_params_cancel(self, async_client: AsyncLlamaStackClient) -> None: - with pytest.raises(ValueError, match=r"Expected a non-empty value for `task_id` but received ''"): + with pytest.raises(ValueError, match=r"Expected a non-empty value for `benchmark_id` but received ''"): await async_client.eval.jobs.with_raw_response.cancel( job_id="job_id", - task_id="", + benchmark_id="", ) with pytest.raises(ValueError, match=r"Expected a non-empty value for `job_id` but received ''"): await async_client.eval.jobs.with_raw_response.cancel( job_id="", - task_id="task_id", + benchmark_id="benchmark_id", ) @parametrize async def test_method_status(self, async_client: AsyncLlamaStackClient) -> None: job = await async_client.eval.jobs.status( job_id="job_id", - task_id="task_id", + benchmark_id="benchmark_id", ) assert_matches_type(Optional[JobStatusResponse], job, path=["response"]) @@ -274,7 +274,7 @@ async def test_method_status(self, async_client: AsyncLlamaStackClient) -> None: async def test_raw_response_status(self, async_client: AsyncLlamaStackClient) -> None: response = await async_client.eval.jobs.with_raw_response.status( job_id="job_id", - task_id="task_id", + benchmark_id="benchmark_id", ) assert response.is_closed is True @@ -286,7 +286,7 @@ async def test_raw_response_status(self, async_client: AsyncLlamaStackClient) -> async def test_streaming_response_status(self, async_client: AsyncLlamaStackClient) -> None: async with async_client.eval.jobs.with_streaming_response.status( job_id="job_id", - task_id="task_id", + benchmark_id="benchmark_id", ) as response: assert not response.is_closed assert response.http_request.headers.get("X-Stainless-Lang") == "python" @@ -298,14 +298,14 @@ async def test_streaming_response_status(self, async_client: AsyncLlamaStackClie @parametrize async def test_path_params_status(self, async_client: AsyncLlamaStackClient) -> None: - with pytest.raises(ValueError, match=r"Expected a non-empty value for `task_id` but received ''"): + with pytest.raises(ValueError, match=r"Expected a non-empty value for `benchmark_id` but received ''"): await async_client.eval.jobs.with_raw_response.status( job_id="job_id", - task_id="", + benchmark_id="", ) with pytest.raises(ValueError, match=r"Expected a non-empty value for `job_id` but received ''"): await async_client.eval.jobs.with_raw_response.status( job_id="", - task_id="task_id", + benchmark_id="benchmark_id", ) diff --git a/tests/api_resources/test_benchmarks.py b/tests/api_resources/test_benchmarks.py new file mode 100644 index 00000000..03aceead --- /dev/null +++ 
b/tests/api_resources/test_benchmarks.py @@ -0,0 +1,246 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from __future__ import annotations + +import os +from typing import Any, Optional, cast + +import pytest + +from tests.utils import assert_matches_type +from llama_stack_client import LlamaStackClient, AsyncLlamaStackClient +from llama_stack_client.types import Benchmark, BenchmarkListResponse + +base_url = os.environ.get("TEST_API_BASE_URL", "http://127.0.0.1:4010") + + +class TestBenchmarks: + parametrize = pytest.mark.parametrize("client", [False, True], indirect=True, ids=["loose", "strict"]) + + @parametrize + def test_method_retrieve(self, client: LlamaStackClient) -> None: + benchmark = client.benchmarks.retrieve( + "benchmark_id", + ) + assert_matches_type(Optional[Benchmark], benchmark, path=["response"]) + + @parametrize + def test_raw_response_retrieve(self, client: LlamaStackClient) -> None: + response = client.benchmarks.with_raw_response.retrieve( + "benchmark_id", + ) + + assert response.is_closed is True + assert response.http_request.headers.get("X-Stainless-Lang") == "python" + benchmark = response.parse() + assert_matches_type(Optional[Benchmark], benchmark, path=["response"]) + + @parametrize + def test_streaming_response_retrieve(self, client: LlamaStackClient) -> None: + with client.benchmarks.with_streaming_response.retrieve( + "benchmark_id", + ) as response: + assert not response.is_closed + assert response.http_request.headers.get("X-Stainless-Lang") == "python" + + benchmark = response.parse() + assert_matches_type(Optional[Benchmark], benchmark, path=["response"]) + + assert cast(Any, response.is_closed) is True + + @parametrize + def test_path_params_retrieve(self, client: LlamaStackClient) -> None: + with pytest.raises(ValueError, match=r"Expected a non-empty value for `benchmark_id` but received ''"): + client.benchmarks.with_raw_response.retrieve( + "", + ) + + @parametrize + def test_method_list(self, client: LlamaStackClient) -> None: + benchmark = client.benchmarks.list() + assert_matches_type(BenchmarkListResponse, benchmark, path=["response"]) + + @parametrize + def test_raw_response_list(self, client: LlamaStackClient) -> None: + response = client.benchmarks.with_raw_response.list() + + assert response.is_closed is True + assert response.http_request.headers.get("X-Stainless-Lang") == "python" + benchmark = response.parse() + assert_matches_type(BenchmarkListResponse, benchmark, path=["response"]) + + @parametrize + def test_streaming_response_list(self, client: LlamaStackClient) -> None: + with client.benchmarks.with_streaming_response.list() as response: + assert not response.is_closed + assert response.http_request.headers.get("X-Stainless-Lang") == "python" + + benchmark = response.parse() + assert_matches_type(BenchmarkListResponse, benchmark, path=["response"]) + + assert cast(Any, response.is_closed) is True + + @parametrize + def test_method_register(self, client: LlamaStackClient) -> None: + benchmark = client.benchmarks.register( + benchmark_id="benchmark_id", + dataset_id="dataset_id", + scoring_functions=["string"], + ) + assert benchmark is None + + @parametrize + def test_method_register_with_all_params(self, client: LlamaStackClient) -> None: + benchmark = client.benchmarks.register( + benchmark_id="benchmark_id", + dataset_id="dataset_id", + scoring_functions=["string"], + metadata={"foo": True}, + provider_benchmark_id="provider_benchmark_id", + provider_id="provider_id", + ) + assert 
benchmark is None + + @parametrize + def test_raw_response_register(self, client: LlamaStackClient) -> None: + response = client.benchmarks.with_raw_response.register( + benchmark_id="benchmark_id", + dataset_id="dataset_id", + scoring_functions=["string"], + ) + + assert response.is_closed is True + assert response.http_request.headers.get("X-Stainless-Lang") == "python" + benchmark = response.parse() + assert benchmark is None + + @parametrize + def test_streaming_response_register(self, client: LlamaStackClient) -> None: + with client.benchmarks.with_streaming_response.register( + benchmark_id="benchmark_id", + dataset_id="dataset_id", + scoring_functions=["string"], + ) as response: + assert not response.is_closed + assert response.http_request.headers.get("X-Stainless-Lang") == "python" + + benchmark = response.parse() + assert benchmark is None + + assert cast(Any, response.is_closed) is True + + +class TestAsyncBenchmarks: + parametrize = pytest.mark.parametrize("async_client", [False, True], indirect=True, ids=["loose", "strict"]) + + @parametrize + async def test_method_retrieve(self, async_client: AsyncLlamaStackClient) -> None: + benchmark = await async_client.benchmarks.retrieve( + "benchmark_id", + ) + assert_matches_type(Optional[Benchmark], benchmark, path=["response"]) + + @parametrize + async def test_raw_response_retrieve(self, async_client: AsyncLlamaStackClient) -> None: + response = await async_client.benchmarks.with_raw_response.retrieve( + "benchmark_id", + ) + + assert response.is_closed is True + assert response.http_request.headers.get("X-Stainless-Lang") == "python" + benchmark = await response.parse() + assert_matches_type(Optional[Benchmark], benchmark, path=["response"]) + + @parametrize + async def test_streaming_response_retrieve(self, async_client: AsyncLlamaStackClient) -> None: + async with async_client.benchmarks.with_streaming_response.retrieve( + "benchmark_id", + ) as response: + assert not response.is_closed + assert response.http_request.headers.get("X-Stainless-Lang") == "python" + + benchmark = await response.parse() + assert_matches_type(Optional[Benchmark], benchmark, path=["response"]) + + assert cast(Any, response.is_closed) is True + + @parametrize + async def test_path_params_retrieve(self, async_client: AsyncLlamaStackClient) -> None: + with pytest.raises(ValueError, match=r"Expected a non-empty value for `benchmark_id` but received ''"): + await async_client.benchmarks.with_raw_response.retrieve( + "", + ) + + @parametrize + async def test_method_list(self, async_client: AsyncLlamaStackClient) -> None: + benchmark = await async_client.benchmarks.list() + assert_matches_type(BenchmarkListResponse, benchmark, path=["response"]) + + @parametrize + async def test_raw_response_list(self, async_client: AsyncLlamaStackClient) -> None: + response = await async_client.benchmarks.with_raw_response.list() + + assert response.is_closed is True + assert response.http_request.headers.get("X-Stainless-Lang") == "python" + benchmark = await response.parse() + assert_matches_type(BenchmarkListResponse, benchmark, path=["response"]) + + @parametrize + async def test_streaming_response_list(self, async_client: AsyncLlamaStackClient) -> None: + async with async_client.benchmarks.with_streaming_response.list() as response: + assert not response.is_closed + assert response.http_request.headers.get("X-Stainless-Lang") == "python" + + benchmark = await response.parse() + assert_matches_type(BenchmarkListResponse, benchmark, path=["response"]) + + assert cast(Any, 
response.is_closed) is True + + @parametrize + async def test_method_register(self, async_client: AsyncLlamaStackClient) -> None: + benchmark = await async_client.benchmarks.register( + benchmark_id="benchmark_id", + dataset_id="dataset_id", + scoring_functions=["string"], + ) + assert benchmark is None + + @parametrize + async def test_method_register_with_all_params(self, async_client: AsyncLlamaStackClient) -> None: + benchmark = await async_client.benchmarks.register( + benchmark_id="benchmark_id", + dataset_id="dataset_id", + scoring_functions=["string"], + metadata={"foo": True}, + provider_benchmark_id="provider_benchmark_id", + provider_id="provider_id", + ) + assert benchmark is None + + @parametrize + async def test_raw_response_register(self, async_client: AsyncLlamaStackClient) -> None: + response = await async_client.benchmarks.with_raw_response.register( + benchmark_id="benchmark_id", + dataset_id="dataset_id", + scoring_functions=["string"], + ) + + assert response.is_closed is True + assert response.http_request.headers.get("X-Stainless-Lang") == "python" + benchmark = await response.parse() + assert benchmark is None + + @parametrize + async def test_streaming_response_register(self, async_client: AsyncLlamaStackClient) -> None: + async with async_client.benchmarks.with_streaming_response.register( + benchmark_id="benchmark_id", + dataset_id="dataset_id", + scoring_functions=["string"], + ) as response: + assert not response.is_closed + assert response.http_request.headers.get("X-Stainless-Lang") == "python" + + benchmark = await response.parse() + assert benchmark is None + + assert cast(Any, response.is_closed) is True diff --git a/tests/api_resources/test_eval.py b/tests/api_resources/test_eval.py index 52556bf2..de5d0cac 100644 --- a/tests/api_resources/test_eval.py +++ b/tests/api_resources/test_eval.py @@ -32,6 +32,12 @@ def test_method_evaluate_rows(self, client: LlamaStackClient) -> None: "sampling_params": {"strategy": {"type": "greedy"}}, "type": "model", }, + "scoring_params": { + "foo": { + "judge_model": "judge_model", + "type": "llm_as_judge", + } + }, "type": "benchmark", }, ) @@ -57,6 +63,15 @@ def test_method_evaluate_rows_with_all_params(self, client: LlamaStackClient) -> "role": "system", }, }, + "scoring_params": { + "foo": { + "judge_model": "judge_model", + "type": "llm_as_judge", + "aggregation_functions": ["average"], + "judge_score_regexes": ["string"], + "prompt_template": "prompt_template", + } + }, "type": "benchmark", "num_examples": 0, }, @@ -75,6 +90,12 @@ def test_raw_response_evaluate_rows(self, client: LlamaStackClient) -> None: "sampling_params": {"strategy": {"type": "greedy"}}, "type": "model", }, + "scoring_params": { + "foo": { + "judge_model": "judge_model", + "type": "llm_as_judge", + } + }, "type": "benchmark", }, ) @@ -96,6 +117,12 @@ def test_streaming_response_evaluate_rows(self, client: LlamaStackClient) -> Non "sampling_params": {"strategy": {"type": "greedy"}}, "type": "model", }, + "scoring_params": { + "foo": { + "judge_model": "judge_model", + "type": "llm_as_judge", + } + }, "type": "benchmark", }, ) as response: @@ -120,6 +147,149 @@ def test_path_params_evaluate_rows(self, client: LlamaStackClient) -> None: "sampling_params": {"strategy": {"type": "greedy"}}, "type": "model", }, + "scoring_params": { + "foo": { + "judge_model": "judge_model", + "type": "llm_as_judge", + } + }, + "type": "benchmark", + }, + ) + + @parametrize + def test_method_evaluate_rows_alpha(self, client: LlamaStackClient) -> None: + eval = 
client.eval.evaluate_rows_alpha( + benchmark_id="benchmark_id", + input_rows=[{"foo": True}], + scoring_functions=["string"], + task_config={ + "eval_candidate": { + "model": "model", + "sampling_params": {"strategy": {"type": "greedy"}}, + "type": "model", + }, + "scoring_params": { + "foo": { + "judge_model": "judge_model", + "type": "llm_as_judge", + } + }, + "type": "benchmark", + }, + ) + assert_matches_type(EvaluateResponse, eval, path=["response"]) + + @parametrize + def test_method_evaluate_rows_alpha_with_all_params(self, client: LlamaStackClient) -> None: + eval = client.eval.evaluate_rows_alpha( + benchmark_id="benchmark_id", + input_rows=[{"foo": True}], + scoring_functions=["string"], + task_config={ + "eval_candidate": { + "model": "model", + "sampling_params": { + "strategy": {"type": "greedy"}, + "max_tokens": 0, + "repetition_penalty": 0, + }, + "type": "model", + "system_message": { + "content": "string", + "role": "system", + }, + }, + "scoring_params": { + "foo": { + "judge_model": "judge_model", + "type": "llm_as_judge", + "aggregation_functions": ["average"], + "judge_score_regexes": ["string"], + "prompt_template": "prompt_template", + } + }, + "type": "benchmark", + "num_examples": 0, + }, + ) + assert_matches_type(EvaluateResponse, eval, path=["response"]) + + @parametrize + def test_raw_response_evaluate_rows_alpha(self, client: LlamaStackClient) -> None: + response = client.eval.with_raw_response.evaluate_rows_alpha( + benchmark_id="benchmark_id", + input_rows=[{"foo": True}], + scoring_functions=["string"], + task_config={ + "eval_candidate": { + "model": "model", + "sampling_params": {"strategy": {"type": "greedy"}}, + "type": "model", + }, + "scoring_params": { + "foo": { + "judge_model": "judge_model", + "type": "llm_as_judge", + } + }, + "type": "benchmark", + }, + ) + + assert response.is_closed is True + assert response.http_request.headers.get("X-Stainless-Lang") == "python" + eval = response.parse() + assert_matches_type(EvaluateResponse, eval, path=["response"]) + + @parametrize + def test_streaming_response_evaluate_rows_alpha(self, client: LlamaStackClient) -> None: + with client.eval.with_streaming_response.evaluate_rows_alpha( + benchmark_id="benchmark_id", + input_rows=[{"foo": True}], + scoring_functions=["string"], + task_config={ + "eval_candidate": { + "model": "model", + "sampling_params": {"strategy": {"type": "greedy"}}, + "type": "model", + }, + "scoring_params": { + "foo": { + "judge_model": "judge_model", + "type": "llm_as_judge", + } + }, + "type": "benchmark", + }, + ) as response: + assert not response.is_closed + assert response.http_request.headers.get("X-Stainless-Lang") == "python" + + eval = response.parse() + assert_matches_type(EvaluateResponse, eval, path=["response"]) + + assert cast(Any, response.is_closed) is True + + @parametrize + def test_path_params_evaluate_rows_alpha(self, client: LlamaStackClient) -> None: + with pytest.raises(ValueError, match=r"Expected a non-empty value for `benchmark_id` but received ''"): + client.eval.with_raw_response.evaluate_rows_alpha( + benchmark_id="", + input_rows=[{"foo": True}], + scoring_functions=["string"], + task_config={ + "eval_candidate": { + "model": "model", + "sampling_params": {"strategy": {"type": "greedy"}}, + "type": "model", + }, + "scoring_params": { + "foo": { + "judge_model": "judge_model", + "type": "llm_as_judge", + } + }, "type": "benchmark", }, ) @@ -134,6 +304,12 @@ def test_method_run_eval(self, client: LlamaStackClient) -> None: "sampling_params": {"strategy": 
{"type": "greedy"}}, "type": "model", }, + "scoring_params": { + "foo": { + "judge_model": "judge_model", + "type": "llm_as_judge", + } + }, "type": "benchmark", }, ) @@ -157,6 +333,15 @@ def test_method_run_eval_with_all_params(self, client: LlamaStackClient) -> None "role": "system", }, }, + "scoring_params": { + "foo": { + "judge_model": "judge_model", + "type": "llm_as_judge", + "aggregation_functions": ["average"], + "judge_score_regexes": ["string"], + "prompt_template": "prompt_template", + } + }, "type": "benchmark", "num_examples": 0, }, @@ -173,6 +358,12 @@ def test_raw_response_run_eval(self, client: LlamaStackClient) -> None: "sampling_params": {"strategy": {"type": "greedy"}}, "type": "model", }, + "scoring_params": { + "foo": { + "judge_model": "judge_model", + "type": "llm_as_judge", + } + }, "type": "benchmark", }, ) @@ -192,6 +383,12 @@ def test_streaming_response_run_eval(self, client: LlamaStackClient) -> None: "sampling_params": {"strategy": {"type": "greedy"}}, "type": "model", }, + "scoring_params": { + "foo": { + "judge_model": "judge_model", + "type": "llm_as_judge", + } + }, "type": "benchmark", }, ) as response: @@ -214,6 +411,139 @@ def test_path_params_run_eval(self, client: LlamaStackClient) -> None: "sampling_params": {"strategy": {"type": "greedy"}}, "type": "model", }, + "scoring_params": { + "foo": { + "judge_model": "judge_model", + "type": "llm_as_judge", + } + }, + "type": "benchmark", + }, + ) + + @parametrize + def test_method_run_eval_alpha(self, client: LlamaStackClient) -> None: + eval = client.eval.run_eval_alpha( + benchmark_id="benchmark_id", + task_config={ + "eval_candidate": { + "model": "model", + "sampling_params": {"strategy": {"type": "greedy"}}, + "type": "model", + }, + "scoring_params": { + "foo": { + "judge_model": "judge_model", + "type": "llm_as_judge", + } + }, + "type": "benchmark", + }, + ) + assert_matches_type(Job, eval, path=["response"]) + + @parametrize + def test_method_run_eval_alpha_with_all_params(self, client: LlamaStackClient) -> None: + eval = client.eval.run_eval_alpha( + benchmark_id="benchmark_id", + task_config={ + "eval_candidate": { + "model": "model", + "sampling_params": { + "strategy": {"type": "greedy"}, + "max_tokens": 0, + "repetition_penalty": 0, + }, + "type": "model", + "system_message": { + "content": "string", + "role": "system", + }, + }, + "scoring_params": { + "foo": { + "judge_model": "judge_model", + "type": "llm_as_judge", + "aggregation_functions": ["average"], + "judge_score_regexes": ["string"], + "prompt_template": "prompt_template", + } + }, + "type": "benchmark", + "num_examples": 0, + }, + ) + assert_matches_type(Job, eval, path=["response"]) + + @parametrize + def test_raw_response_run_eval_alpha(self, client: LlamaStackClient) -> None: + response = client.eval.with_raw_response.run_eval_alpha( + benchmark_id="benchmark_id", + task_config={ + "eval_candidate": { + "model": "model", + "sampling_params": {"strategy": {"type": "greedy"}}, + "type": "model", + }, + "scoring_params": { + "foo": { + "judge_model": "judge_model", + "type": "llm_as_judge", + } + }, + "type": "benchmark", + }, + ) + + assert response.is_closed is True + assert response.http_request.headers.get("X-Stainless-Lang") == "python" + eval = response.parse() + assert_matches_type(Job, eval, path=["response"]) + + @parametrize + def test_streaming_response_run_eval_alpha(self, client: LlamaStackClient) -> None: + with client.eval.with_streaming_response.run_eval_alpha( + benchmark_id="benchmark_id", + task_config={ + 
"eval_candidate": { + "model": "model", + "sampling_params": {"strategy": {"type": "greedy"}}, + "type": "model", + }, + "scoring_params": { + "foo": { + "judge_model": "judge_model", + "type": "llm_as_judge", + } + }, + "type": "benchmark", + }, + ) as response: + assert not response.is_closed + assert response.http_request.headers.get("X-Stainless-Lang") == "python" + + eval = response.parse() + assert_matches_type(Job, eval, path=["response"]) + + assert cast(Any, response.is_closed) is True + + @parametrize + def test_path_params_run_eval_alpha(self, client: LlamaStackClient) -> None: + with pytest.raises(ValueError, match=r"Expected a non-empty value for `benchmark_id` but received ''"): + client.eval.with_raw_response.run_eval_alpha( + benchmark_id="", + task_config={ + "eval_candidate": { + "model": "model", + "sampling_params": {"strategy": {"type": "greedy"}}, + "type": "model", + }, + "scoring_params": { + "foo": { + "judge_model": "judge_model", + "type": "llm_as_judge", + } + }, "type": "benchmark", }, ) @@ -234,6 +564,12 @@ async def test_method_evaluate_rows(self, async_client: AsyncLlamaStackClient) - "sampling_params": {"strategy": {"type": "greedy"}}, "type": "model", }, + "scoring_params": { + "foo": { + "judge_model": "judge_model", + "type": "llm_as_judge", + } + }, "type": "benchmark", }, ) @@ -259,6 +595,15 @@ async def test_method_evaluate_rows_with_all_params(self, async_client: AsyncLla "role": "system", }, }, + "scoring_params": { + "foo": { + "judge_model": "judge_model", + "type": "llm_as_judge", + "aggregation_functions": ["average"], + "judge_score_regexes": ["string"], + "prompt_template": "prompt_template", + } + }, "type": "benchmark", "num_examples": 0, }, @@ -277,6 +622,12 @@ async def test_raw_response_evaluate_rows(self, async_client: AsyncLlamaStackCli "sampling_params": {"strategy": {"type": "greedy"}}, "type": "model", }, + "scoring_params": { + "foo": { + "judge_model": "judge_model", + "type": "llm_as_judge", + } + }, "type": "benchmark", }, ) @@ -298,6 +649,12 @@ async def test_streaming_response_evaluate_rows(self, async_client: AsyncLlamaSt "sampling_params": {"strategy": {"type": "greedy"}}, "type": "model", }, + "scoring_params": { + "foo": { + "judge_model": "judge_model", + "type": "llm_as_judge", + } + }, "type": "benchmark", }, ) as response: @@ -322,6 +679,149 @@ async def test_path_params_evaluate_rows(self, async_client: AsyncLlamaStackClie "sampling_params": {"strategy": {"type": "greedy"}}, "type": "model", }, + "scoring_params": { + "foo": { + "judge_model": "judge_model", + "type": "llm_as_judge", + } + }, + "type": "benchmark", + }, + ) + + @parametrize + async def test_method_evaluate_rows_alpha(self, async_client: AsyncLlamaStackClient) -> None: + eval = await async_client.eval.evaluate_rows_alpha( + benchmark_id="benchmark_id", + input_rows=[{"foo": True}], + scoring_functions=["string"], + task_config={ + "eval_candidate": { + "model": "model", + "sampling_params": {"strategy": {"type": "greedy"}}, + "type": "model", + }, + "scoring_params": { + "foo": { + "judge_model": "judge_model", + "type": "llm_as_judge", + } + }, + "type": "benchmark", + }, + ) + assert_matches_type(EvaluateResponse, eval, path=["response"]) + + @parametrize + async def test_method_evaluate_rows_alpha_with_all_params(self, async_client: AsyncLlamaStackClient) -> None: + eval = await async_client.eval.evaluate_rows_alpha( + benchmark_id="benchmark_id", + input_rows=[{"foo": True}], + scoring_functions=["string"], + task_config={ + "eval_candidate": { + 
"model": "model", + "sampling_params": { + "strategy": {"type": "greedy"}, + "max_tokens": 0, + "repetition_penalty": 0, + }, + "type": "model", + "system_message": { + "content": "string", + "role": "system", + }, + }, + "scoring_params": { + "foo": { + "judge_model": "judge_model", + "type": "llm_as_judge", + "aggregation_functions": ["average"], + "judge_score_regexes": ["string"], + "prompt_template": "prompt_template", + } + }, + "type": "benchmark", + "num_examples": 0, + }, + ) + assert_matches_type(EvaluateResponse, eval, path=["response"]) + + @parametrize + async def test_raw_response_evaluate_rows_alpha(self, async_client: AsyncLlamaStackClient) -> None: + response = await async_client.eval.with_raw_response.evaluate_rows_alpha( + benchmark_id="benchmark_id", + input_rows=[{"foo": True}], + scoring_functions=["string"], + task_config={ + "eval_candidate": { + "model": "model", + "sampling_params": {"strategy": {"type": "greedy"}}, + "type": "model", + }, + "scoring_params": { + "foo": { + "judge_model": "judge_model", + "type": "llm_as_judge", + } + }, + "type": "benchmark", + }, + ) + + assert response.is_closed is True + assert response.http_request.headers.get("X-Stainless-Lang") == "python" + eval = await response.parse() + assert_matches_type(EvaluateResponse, eval, path=["response"]) + + @parametrize + async def test_streaming_response_evaluate_rows_alpha(self, async_client: AsyncLlamaStackClient) -> None: + async with async_client.eval.with_streaming_response.evaluate_rows_alpha( + benchmark_id="benchmark_id", + input_rows=[{"foo": True}], + scoring_functions=["string"], + task_config={ + "eval_candidate": { + "model": "model", + "sampling_params": {"strategy": {"type": "greedy"}}, + "type": "model", + }, + "scoring_params": { + "foo": { + "judge_model": "judge_model", + "type": "llm_as_judge", + } + }, + "type": "benchmark", + }, + ) as response: + assert not response.is_closed + assert response.http_request.headers.get("X-Stainless-Lang") == "python" + + eval = await response.parse() + assert_matches_type(EvaluateResponse, eval, path=["response"]) + + assert cast(Any, response.is_closed) is True + + @parametrize + async def test_path_params_evaluate_rows_alpha(self, async_client: AsyncLlamaStackClient) -> None: + with pytest.raises(ValueError, match=r"Expected a non-empty value for `benchmark_id` but received ''"): + await async_client.eval.with_raw_response.evaluate_rows_alpha( + benchmark_id="", + input_rows=[{"foo": True}], + scoring_functions=["string"], + task_config={ + "eval_candidate": { + "model": "model", + "sampling_params": {"strategy": {"type": "greedy"}}, + "type": "model", + }, + "scoring_params": { + "foo": { + "judge_model": "judge_model", + "type": "llm_as_judge", + } + }, "type": "benchmark", }, ) @@ -336,6 +836,12 @@ async def test_method_run_eval(self, async_client: AsyncLlamaStackClient) -> Non "sampling_params": {"strategy": {"type": "greedy"}}, "type": "model", }, + "scoring_params": { + "foo": { + "judge_model": "judge_model", + "type": "llm_as_judge", + } + }, "type": "benchmark", }, ) @@ -359,6 +865,15 @@ async def test_method_run_eval_with_all_params(self, async_client: AsyncLlamaSta "role": "system", }, }, + "scoring_params": { + "foo": { + "judge_model": "judge_model", + "type": "llm_as_judge", + "aggregation_functions": ["average"], + "judge_score_regexes": ["string"], + "prompt_template": "prompt_template", + } + }, "type": "benchmark", "num_examples": 0, }, @@ -375,6 +890,12 @@ async def test_raw_response_run_eval(self, async_client: 
                 "sampling_params": {"strategy": {"type": "greedy"}},
                 "type": "model",
             },
+            "scoring_params": {
+                "foo": {
+                    "judge_model": "judge_model",
+                    "type": "llm_as_judge",
+                }
+            },
             "type": "benchmark",
         },
     )
@@ -394,6 +915,12 @@ async def test_streaming_response_run_eval(self, async_client: AsyncLlamaStackCl
                 "sampling_params": {"strategy": {"type": "greedy"}},
                 "type": "model",
             },
+            "scoring_params": {
+                "foo": {
+                    "judge_model": "judge_model",
+                    "type": "llm_as_judge",
+                }
+            },
             "type": "benchmark",
         },
     ) as response:
@@ -416,6 +943,139 @@ async def test_path_params_run_eval(self, async_client: AsyncLlamaStackClient) -
                         "sampling_params": {"strategy": {"type": "greedy"}},
                         "type": "model",
                     },
+                    "scoring_params": {
+                        "foo": {
+                            "judge_model": "judge_model",
+                            "type": "llm_as_judge",
+                        }
+                    },
+                    "type": "benchmark",
+                },
+            )
+
+    @parametrize
+    async def test_method_run_eval_alpha(self, async_client: AsyncLlamaStackClient) -> None:
+        eval = await async_client.eval.run_eval_alpha(
+            benchmark_id="benchmark_id",
+            task_config={
+                "eval_candidate": {
+                    "model": "model",
+                    "sampling_params": {"strategy": {"type": "greedy"}},
+                    "type": "model",
+                },
+                "scoring_params": {
+                    "foo": {
+                        "judge_model": "judge_model",
+                        "type": "llm_as_judge",
+                    }
+                },
+                "type": "benchmark",
+            },
+        )
+        assert_matches_type(Job, eval, path=["response"])
+
+    @parametrize
+    async def test_method_run_eval_alpha_with_all_params(self, async_client: AsyncLlamaStackClient) -> None:
+        eval = await async_client.eval.run_eval_alpha(
+            benchmark_id="benchmark_id",
+            task_config={
+                "eval_candidate": {
+                    "model": "model",
+                    "sampling_params": {
+                        "strategy": {"type": "greedy"},
+                        "max_tokens": 0,
+                        "repetition_penalty": 0,
+                    },
+                    "type": "model",
+                    "system_message": {
+                        "content": "string",
+                        "role": "system",
+                    },
+                },
+                "scoring_params": {
+                    "foo": {
+                        "judge_model": "judge_model",
+                        "type": "llm_as_judge",
+                        "aggregation_functions": ["average"],
+                        "judge_score_regexes": ["string"],
+                        "prompt_template": "prompt_template",
+                    }
+                },
+                "type": "benchmark",
+                "num_examples": 0,
+            },
+        )
+        assert_matches_type(Job, eval, path=["response"])
+
+    @parametrize
+    async def test_raw_response_run_eval_alpha(self, async_client: AsyncLlamaStackClient) -> None:
+        response = await async_client.eval.with_raw_response.run_eval_alpha(
+            benchmark_id="benchmark_id",
+            task_config={
+                "eval_candidate": {
+                    "model": "model",
+                    "sampling_params": {"strategy": {"type": "greedy"}},
+                    "type": "model",
+                },
+                "scoring_params": {
+                    "foo": {
+                        "judge_model": "judge_model",
+                        "type": "llm_as_judge",
+                    }
+                },
+                "type": "benchmark",
+            },
+        )
+
+        assert response.is_closed is True
+        assert response.http_request.headers.get("X-Stainless-Lang") == "python"
+        eval = await response.parse()
+        assert_matches_type(Job, eval, path=["response"])
+
+    @parametrize
+    async def test_streaming_response_run_eval_alpha(self, async_client: AsyncLlamaStackClient) -> None:
+        async with async_client.eval.with_streaming_response.run_eval_alpha(
+            benchmark_id="benchmark_id",
+            task_config={
+                "eval_candidate": {
+                    "model": "model",
+                    "sampling_params": {"strategy": {"type": "greedy"}},
+                    "type": "model",
+                },
+                "scoring_params": {
+                    "foo": {
+                        "judge_model": "judge_model",
+                        "type": "llm_as_judge",
+                    }
+                },
+                "type": "benchmark",
+            },
+        ) as response:
+            assert not response.is_closed
+            assert response.http_request.headers.get("X-Stainless-Lang") == "python"
+
+            eval = await response.parse()
+            assert_matches_type(Job, eval, path=["response"])
+
+        assert cast(Any, response.is_closed) is True
+
+    @parametrize
+    async def test_path_params_run_eval_alpha(self, async_client: AsyncLlamaStackClient) -> None:
+        with pytest.raises(ValueError, match=r"Expected a non-empty value for `benchmark_id` but received ''"):
+            await async_client.eval.with_raw_response.run_eval_alpha(
+                benchmark_id="",
+                task_config={
+                    "eval_candidate": {
+                        "model": "model",
+                        "sampling_params": {"strategy": {"type": "greedy"}},
+                        "type": "model",
+                    },
+                    "scoring_params": {
+                        "foo": {
+                            "judge_model": "judge_model",
+                            "type": "llm_as_judge",
+                        }
+                    },
                     "type": "benchmark",
                 },
             )
diff --git a/tests/api_resources/test_eval_tasks.py b/tests/api_resources/test_eval_tasks.py
index 5b18621b..6ca2f2c4 100644
--- a/tests/api_resources/test_eval_tasks.py
+++ b/tests/api_resources/test_eval_tasks.py
@@ -9,7 +9,7 @@
 
 from tests.utils import assert_matches_type
 from llama_stack_client import LlamaStackClient, AsyncLlamaStackClient
-from llama_stack_client.types import EvalTask, EvalTaskListResponse
+from llama_stack_client.types import Benchmark, BenchmarkListResponse
 
 base_url = os.environ.get("TEST_API_BASE_URL", "http://127.0.0.1:4010")
@@ -22,7 +22,7 @@ def test_method_retrieve(self, client: LlamaStackClient) -> None:
         eval_task = client.eval_tasks.retrieve(
             "eval_task_id",
         )
-        assert_matches_type(Optional[EvalTask], eval_task, path=["response"])
+        assert_matches_type(Optional[Benchmark], eval_task, path=["response"])
 
     @parametrize
     def test_raw_response_retrieve(self, client: LlamaStackClient) -> None:
@@ -33,7 +33,7 @@ def test_raw_response_retrieve(self, client: LlamaStackClient) -> None:
         assert response.is_closed is True
         assert response.http_request.headers.get("X-Stainless-Lang") == "python"
         eval_task = response.parse()
-        assert_matches_type(Optional[EvalTask], eval_task, path=["response"])
+        assert_matches_type(Optional[Benchmark], eval_task, path=["response"])
 
     @parametrize
     def test_streaming_response_retrieve(self, client: LlamaStackClient) -> None:
@@ -44,7 +44,7 @@ def test_streaming_response_retrieve(self, client: LlamaStackClient) -> None:
             assert response.http_request.headers.get("X-Stainless-Lang") == "python"
 
             eval_task = response.parse()
-            assert_matches_type(Optional[EvalTask], eval_task, path=["response"])
+            assert_matches_type(Optional[Benchmark], eval_task, path=["response"])
 
         assert cast(Any, response.is_closed) is True
@@ -58,7 +58,7 @@ def test_path_params_retrieve(self, client: LlamaStackClient) -> None:
     @parametrize
     def test_method_list(self, client: LlamaStackClient) -> None:
         eval_task = client.eval_tasks.list()
-        assert_matches_type(EvalTaskListResponse, eval_task, path=["response"])
+        assert_matches_type(BenchmarkListResponse, eval_task, path=["response"])
 
     @parametrize
     def test_raw_response_list(self, client: LlamaStackClient) -> None:
@@ -67,7 +67,7 @@ def test_raw_response_list(self, client: LlamaStackClient) -> None:
         assert response.is_closed is True
         assert response.http_request.headers.get("X-Stainless-Lang") == "python"
         eval_task = response.parse()
-        assert_matches_type(EvalTaskListResponse, eval_task, path=["response"])
+        assert_matches_type(BenchmarkListResponse, eval_task, path=["response"])
 
     @parametrize
     def test_streaming_response_list(self, client: LlamaStackClient) -> None:
@@ -76,7 +76,7 @@ def test_streaming_response_list(self, client: LlamaStackClient) -> None:
             assert response.http_request.headers.get("X-Stainless-Lang") == "python"
 
             eval_task = response.parse()
-            assert_matches_type(EvalTaskListResponse, eval_task, path=["response"])
+            assert_matches_type(BenchmarkListResponse, eval_task, path=["response"])
 
         assert cast(Any, response.is_closed) is True
@@ -96,7 +96,7 @@ def test_method_register_with_all_params(self, client: LlamaStackClient) -> None
             eval_task_id="eval_task_id",
             scoring_functions=["string"],
             metadata={"foo": True},
-            provider_eval_task_id="provider_eval_task_id",
+            provider_benchmark_id="provider_benchmark_id",
             provider_id="provider_id",
         )
         assert eval_task is None
@@ -138,7 +138,7 @@ async def test_method_retrieve(self, async_client: AsyncLlamaStackClient) -> Non
         eval_task = await async_client.eval_tasks.retrieve(
             "eval_task_id",
         )
-        assert_matches_type(Optional[EvalTask], eval_task, path=["response"])
+        assert_matches_type(Optional[Benchmark], eval_task, path=["response"])
 
     @parametrize
     async def test_raw_response_retrieve(self, async_client: AsyncLlamaStackClient) -> None:
@@ -149,7 +149,7 @@ async def test_raw_response_retrieve(self, async_client: AsyncLlamaStackClient)
         assert response.is_closed is True
         assert response.http_request.headers.get("X-Stainless-Lang") == "python"
         eval_task = await response.parse()
-        assert_matches_type(Optional[EvalTask], eval_task, path=["response"])
+        assert_matches_type(Optional[Benchmark], eval_task, path=["response"])
 
     @parametrize
     async def test_streaming_response_retrieve(self, async_client: AsyncLlamaStackClient) -> None:
@@ -160,7 +160,7 @@ async def test_streaming_response_retrieve(self, async_client: AsyncLlamaStackCl
             assert response.http_request.headers.get("X-Stainless-Lang") == "python"
 
             eval_task = await response.parse()
-            assert_matches_type(Optional[EvalTask], eval_task, path=["response"])
+            assert_matches_type(Optional[Benchmark], eval_task, path=["response"])
 
         assert cast(Any, response.is_closed) is True
@@ -174,7 +174,7 @@ async def test_path_params_retrieve(self, async_client: AsyncLlamaStackClient) -
     @parametrize
     async def test_method_list(self, async_client: AsyncLlamaStackClient) -> None:
         eval_task = await async_client.eval_tasks.list()
-        assert_matches_type(EvalTaskListResponse, eval_task, path=["response"])
+        assert_matches_type(BenchmarkListResponse, eval_task, path=["response"])
 
     @parametrize
     async def test_raw_response_list(self, async_client: AsyncLlamaStackClient) -> None:
@@ -183,7 +183,7 @@ async def test_raw_response_list(self, async_client: AsyncLlamaStackClient) -> N
         assert response.is_closed is True
         assert response.http_request.headers.get("X-Stainless-Lang") == "python"
         eval_task = await response.parse()
-        assert_matches_type(EvalTaskListResponse, eval_task, path=["response"])
+        assert_matches_type(BenchmarkListResponse, eval_task, path=["response"])
 
     @parametrize
     async def test_streaming_response_list(self, async_client: AsyncLlamaStackClient) -> None:
@@ -192,7 +192,7 @@ async def test_streaming_response_list(self, async_client: AsyncLlamaStackClient
             assert response.http_request.headers.get("X-Stainless-Lang") == "python"
 
             eval_task = await response.parse()
-            assert_matches_type(EvalTaskListResponse, eval_task, path=["response"])
+            assert_matches_type(BenchmarkListResponse, eval_task, path=["response"])
 
         assert cast(Any, response.is_closed) is True
@@ -212,7 +212,7 @@ async def test_method_register_with_all_params(self, async_client: AsyncLlamaSta
             eval_task_id="eval_task_id",
             scoring_functions=["string"],
             metadata={"foo": True},
-            provider_eval_task_id="provider_eval_task_id",
+            provider_benchmark_id="provider_benchmark_id",
             provider_id="provider_id",
         )
         assert eval_task is None
diff --git a/tests/api_resources/test_telemetry.py b/tests/api_resources/test_telemetry.py
index 99886c2d..4f3c81d4 100644
--- a/tests/api_resources/test_telemetry.py
+++ b/tests/api_resources/test_telemetry.py
@@ -182,7 +182,7 @@ def test_method_log_event_with_all_params(self, client: LlamaStackClient) -> Non
                 "timestamp": parse_datetime("2019-12-27T18:11:19.117Z"),
                 "trace_id": "trace_id",
                 "type": "unstructured_log",
-                "attributes": {"foo": True},
+                "attributes": {"foo": "string"},
             },
             ttl_seconds=0,
         )
@@ -577,7 +577,7 @@ async def test_method_log_event_with_all_params(self, async_client: AsyncLlamaSt
                 "timestamp": parse_datetime("2019-12-27T18:11:19.117Z"),
                 "trace_id": "trace_id",
                 "type": "unstructured_log",
-                "attributes": {"foo": True},
+                "attributes": {"foo": "string"},
             },
             ttl_seconds=0,
         )
diff --git a/tests/test_client.py b/tests/test_client.py
index 3ea5f0b7..f282f616 100644
--- a/tests/test_client.py
+++ b/tests/test_client.py
@@ -23,6 +23,7 @@
 from llama_stack_client import LlamaStackClient, AsyncLlamaStackClient, APIResponseValidationError
 from llama_stack_client._types import Omit
+from llama_stack_client._utils import maybe_transform
 from llama_stack_client._models import BaseModel, FinalRequestOptions
 from llama_stack_client._constants import RAW_RESPONSE_HEADER
 from llama_stack_client._exceptions import APIStatusError, APITimeoutError, APIResponseValidationError
@@ -32,6 +33,7 @@
     BaseClient,
     make_request_options,
 )
+from llama_stack_client.types.inference_chat_completion_params import InferenceChatCompletionParamsNonStreaming
 
 from .utils import update_env
@@ -686,14 +688,17 @@ def test_retrying_timeout_errors_doesnt_leak(self, respx_mock: MockRouter) -> No
                 "/v1/inference/chat-completion",
                 body=cast(
                     object,
-                    dict(
-                        messages=[
-                            {
-                                "content": "string",
-                                "role": "user",
-                            }
-                        ],
-                        model_id="model_id",
+                    maybe_transform(
+                        dict(
+                            messages=[
+                                {
+                                    "content": "string",
+                                    "role": "user",
+                                }
+                            ],
+                            model_id="model_id",
+                        ),
+                        InferenceChatCompletionParamsNonStreaming,
                     ),
                 ),
                 cast_to=httpx.Response,
@@ -712,14 +717,17 @@ def test_retrying_status_errors_doesnt_leak(self, respx_mock: MockRouter) -> Non
                 "/v1/inference/chat-completion",
                 body=cast(
                     object,
-                    dict(
-                        messages=[
-                            {
-                                "content": "string",
-                                "role": "user",
-                            }
-                        ],
-                        model_id="model_id",
+                    maybe_transform(
+                        dict(
+                            messages=[
+                                {
+                                    "content": "string",
+                                    "role": "user",
+                                }
+                            ],
+                            model_id="model_id",
+                        ),
+                        InferenceChatCompletionParamsNonStreaming,
                     ),
                 ),
                 cast_to=httpx.Response,
@@ -1474,14 +1482,17 @@ async def test_retrying_timeout_errors_doesnt_leak(self, respx_mock: MockRouter)
                 "/v1/inference/chat-completion",
                 body=cast(
                     object,
-                    dict(
-                        messages=[
-                            {
-                                "content": "string",
-                                "role": "user",
-                            }
-                        ],
-                        model_id="model_id",
+                    maybe_transform(
+                        dict(
+                            messages=[
+                                {
+                                    "content": "string",
+                                    "role": "user",
+                                }
+                            ],
+                            model_id="model_id",
+                        ),
+                        InferenceChatCompletionParamsNonStreaming,
                     ),
                 ),
                 cast_to=httpx.Response,
@@ -1500,14 +1511,17 @@ async def test_retrying_status_errors_doesnt_leak(self, respx_mock: MockRouter)
                 "/v1/inference/chat-completion",
                 body=cast(
                     object,
-                    dict(
-                        messages=[
-                            {
-                                "content": "string",
-                                "role": "user",
-                            }
-                        ],
-                        model_id="model_id",
+                    maybe_transform(
+                        dict(
+                            messages=[
+                                {
+                                    "content": "string",
+                                    "role": "user",
+                                }
+                            ],
+                            model_id="model_id",
+                        ),
+                        InferenceChatCompletionParamsNonStreaming,
                     ),
                 ),
                 cast_to=httpx.Response,
diff --git a/tests/test_transform.py b/tests/test_transform.py
index 364c685e..8ceafb36 100644
--- a/tests/test_transform.py
+++ b/tests/test_transform.py
@@ -2,7 +2,7 @@
 
 import io
 import pathlib
-from typing import Any, List, Union, TypeVar, Iterable, Optional, cast
+from typing import Any, Dict, List, Union, TypeVar, Iterable, Optional, cast
 from datetime import date, datetime
 from typing_extensions import Required, Annotated, TypedDict
@@ -388,6 +388,15 @@ def my_iter() -> Iterable[Baz8]:
     }
 
 
+@parametrize
+@pytest.mark.asyncio
+async def test_dictionary_items(use_async: bool) -> None:
+    class DictItems(TypedDict):
+        foo_baz: Annotated[str, PropertyInfo(alias="fooBaz")]
+
+    assert await transform({"foo": {"foo_baz": "bar"}}, Dict[str, DictItems], use_async) == {"foo": {"fooBaz": "bar"}}
+
+
 class TypedDictIterableUnionStr(TypedDict):
     foo: Annotated[Union[str, Iterable[Baz8]], PropertyInfo(alias="FOO")]