
Commit 20244cf

Authored by singankit, w-javed, and Copilot

Users/singankit/upload evaluation run 1rp (#40771)

* Upload results for 1DP project
* Calling onedp method for 1dp project
* Update sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_constants.py
  Co-authored-by: Copilot <[email protected]>
* Updating get token to work with autogenerated client
* Fixing failed tests

Co-authored-by: Waqas Javed <[email protected]>
Co-authored-by: Copilot <[email protected]>

1 parent 26aedf7 · commit 20244cf
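
In practice, the change routes result logging on the type of azure_ai_project passed to evaluate(): a plain string is treated as a 1DP project endpoint and logged through the new _log_metrics_and_instance_results_onedp helper, while a project-scope dict keeps the existing trace-destination path. A minimal sketch of hitting each branch; the data file, endpoint URL, and scope values are placeholders, not taken from this commit:

# Placeholders only: data file, endpoint URL, and project scope values are illustrative.
from azure.ai.evaluation import evaluate, F1ScoreEvaluator

# 1DP project: a string endpoint triggers the new one-DP logging path.
evaluate(
    data="data.jsonl",
    evaluators={"f1": F1ScoreEvaluator()},
    azure_ai_project="https://<account>.services.ai.azure.com/api/projects/<project>",
)

# Hub-based project: a scope dict keeps the existing MLflow/trace-destination path.
evaluate(
    data="data.jsonl",
    evaluators={"f1": F1ScoreEvaluator()},
    azure_ai_project={
        "subscription_id": "<subscription-id>",
        "resource_group_name": "<resource-group>",
        "project_name": "<project-name>",
    },
)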

File tree: 9 files changed (+115, -22 lines)

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_azure/_clients.py

Lines changed: 1 addition & 1 deletion

@@ -201,4 +201,4 @@ def _generate_path(self, *paths: str) -> str:
         return url
 
     def _get_headers(self) -> Dict[str, str]:
-        return {"Authorization": f"Bearer {self.get_token()}", "Content-Type": "application/json"}
+        return {"Authorization": f"Bearer {self.get_token().token}", "Content-Type": "application/json"}

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_azure/_token_manager.py

Lines changed: 4 additions & 4 deletions

@@ -5,7 +5,7 @@
 import logging
 import time
 import inspect
-from typing import cast, Optional, Union
+from typing import cast, Optional, Union, Any
 
 from azure.core.credentials import TokenCredential, AccessToken
 from azure.identity import AzureCliCredential, DefaultAzureCredential, ManagedIdentityCredential
@@ -71,7 +71,7 @@ def get_aad_credential(self) -> Union[DefaultAzureCredential, ManagedIdentityCre
         # Fall back to using the parent implementation
         return super().get_aad_credential()
 
-    def get_token(self) -> str:
+    def get_token(self, *scopes: str, claims: Union[str, None] = None, tenant_id: Union[str, None] = None, enable_cae: bool = False, **kwargs: Any) -> AccessToken:
         """Get the API token. If the token is not available or has expired, refresh the token.
 
         :return: API token
@@ -82,7 +82,7 @@ def get_token(self) -> str:
         access_token = credential.get_token(self.token_scope)
         self._update_token(access_token)
 
-        return cast(str, self.token)  # check for none is hidden in the _token_needs_update method
+        return self.token  # check for none is hidden in the _token_needs_update method
 
     async def get_token_async(self) -> str:
         """Get the API token asynchronously. If the token is not available or has expired, refresh it.
@@ -112,7 +112,7 @@ def _token_needs_update(self) -> bool:
         )
 
     def _update_token(self, access_token: AccessToken) -> None:
-        self.token = cast(str, access_token.token)
+        self.token = access_token
         self.token_expiry_time = access_token.expires_on
         self.last_refresh_time = time.time()
         self.logger.info("Refreshed Azure management token.")
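
Because get_token now matches the azure-core TokenCredential protocol (an AccessToken return value plus the *scopes/claims/tenant_id/enable_cae parameters), the token manager can be handed directly to clients that expect a credential, and callers that need the raw string read the .token attribute (see the _clients.py and _eval_run.py diffs). A minimal sketch of the new calling pattern; the constructor arguments mirror the usage added in _utils.py below, and the logger is a placeholder:

# Sketch only: the logger is a placeholder; constructor arguments mirror _utils.py in this commit.
import logging
from azure.ai.evaluation._azure._token_manager import AzureMLTokenManager
from azure.ai.evaluation._constants import TokenScope

manager = AzureMLTokenManager(
    TokenScope.COGNITIVE_SERVICES.value,  # scope string from the new TokenScope enum
    logging.getLogger(__name__),
)

access_token = manager.get_token()     # now an azure.core.credentials.AccessToken, not a bare str
bearer = access_token.token            # the raw string callers previously received
expires_on = access_token.expires_on   # POSIX expiry used by _token_needs_update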

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/__init__.py

Lines changed: 5 additions & 0 deletions

@@ -8,9 +8,14 @@
 from . import constants
 from .rai_service import evaluate_with_rai_service
 from .utils import get_harm_severity_level
+from .evaluation_onedp_client import EvaluationServiceOneDPClient
+from .onedp.models import EvaluationUpload, EvaluationResult
 
 __all__ = [
     "get_harm_severity_level",
     "evaluate_with_rai_service",
     "constants",
+    "EvaluationServiceOneDPClient",
+    "EvaluationResult",
+    "EvaluationUpload",
 ]
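
With the re-exports in place, the 1DP client and its models become importable from the package's _common namespace:

# Import path added by this change; the symbols are defined in the onedp submodules.
from azure.ai.evaluation._common import (
    EvaluationServiceOneDPClient,
    EvaluationResult,
    EvaluationUpload,
)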

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/evaluation_onedp_client.py

Lines changed: 7 additions & 4 deletions

@@ -3,7 +3,7 @@
 # ---------------------------------------------------------
 
 import logging
-from typing import Union, Any
+from typing import Union, Any, Dict
 from azure.core.credentials import AzureKeyCredential, TokenCredential
 from azure.ai.evaluation._common.onedp import AIProjectClient as RestEvaluationServiceClient
 from azure.ai.evaluation._common.onedp.models import (PendingUploadRequest, PendingUploadType, EvaluationResult,
@@ -22,7 +22,7 @@ def __init__(self, endpoint: str, credential: Union[AzureKeyCredential, "TokenCr
             **kwargs,
         )
 
-    def create_evaluation_result(self, *, name: str, path: str, version=1, **kwargs) -> None:
+    def create_evaluation_result(self, *, name: str, path: str, version=1, metrics: Dict[str, int] = None, **kwargs) -> EvaluationResult:
         """Create and upload evaluation results to Azure evaluation service.
 
         This method uploads evaluation results from a local path to Azure Blob Storage
@@ -38,6 +38,8 @@ def create_evaluation_result(self, *, name: str, path: str, version=1, **kwargs)
         :type path: str
         :param version: The version number for the evaluation results, defaults to 1
         :type version: int, optional
+        :param metrics: Metrics to be added to the evaluation result
+        :type metrics: Dict[str, int], optional
         :param kwargs: Additional keyword arguments to pass to the underlying API calls
         :return: The response from creating the evaluation result version
         :rtype: EvaluationResult
@@ -58,12 +60,13 @@ def create_evaluation_result(self, *, name: str, path: str, version=1, **kwargs)
             upload(path=path, container_client=container_client, logger=LOGGER)
 
         LOGGER.debug(f"Creating evaluation result version for {name} with version {version}")
-        create_version_response = self.rest_client.evaluation_results.create_version(
+        create_version_response = self.rest_client.evaluation_results.create_or_update_version(
             body=EvaluationResult(
                 blob_uri=start_pending_upload_response.blob_reference_for_consumption.blob_uri,
                 result_type=ResultType.EVALUATION,
                 name=name,
-                version=version
+                version=version,
+                metrics=metrics,
             ),
             name=name,
             version=version,
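
For illustration, a minimal sketch of calling the extended method with the new metrics argument and return value; the endpoint, credential, and metric names are placeholders, not values from this commit:

# Sketch only: endpoint, credential, and metric names are placeholders.
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation._common import EvaluationServiceOneDPClient

client = EvaluationServiceOneDPClient(
    endpoint="https://<your-project-endpoint>",  # hypothetical 1DP project endpoint
    credential=DefaultAzureCredential(),
)

result = client.create_evaluation_result(
    name="my-eval-result",
    path="./evaluation_results",  # local file or folder to upload
    version=1,
    metrics={"passed": 1},        # Dict[str, int] per the updated signature
)
print(result.id)  # referenced as evaluationResultId by _log_metrics_and_instance_results_onedp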

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_constants.py

Lines changed: 6 additions & 0 deletions

@@ -80,6 +80,12 @@ class _AggregationType(enum.Enum):
     SUM = "sum"
     CUSTOM = "custom"
 
+class TokenScope(enum.Enum):
+    """Defines the scope of the token used to access Azure resources."""
+
+    DEFAULT_AZURE_MANAGEMENT = "https://management.azure.com/.default"
+    COGNITIVE_SERVICES = "https://cognitiveservices.azure.com/.default"
+
 
 DEFAULT_EVALUATION_RESULTS_FILE_NAME = "evaluation_results.json"

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_eval_run.py

Lines changed: 1 addition & 1 deletion

@@ -295,7 +295,7 @@ def get_metrics_url(self):
         return f"https://{self._url_base}" "/mlflow/v2.0" f"{self._get_scope()}" f"/api/2.0/mlflow/runs/log-metric"
 
     def _get_token(self) -> str:
-        return self._management_client.get_token()
+        return self._management_client.get_token().token
 
     def request_with_retry(
         self, url: str, method: str, json_dict: Dict[str, Any], headers: Optional[Dict[str, str]] = None

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py

Lines changed: 13 additions & 10 deletions

@@ -43,7 +43,7 @@
     _log_metrics_and_instance_results,
     _trace_destination_from_project_scope,
     _write_output,
-    DataLoaderFactory,
+    DataLoaderFactory, _log_metrics_and_instance_results_onedp,
 )
 from ._batch_run.batch_clients import BatchClient, BatchClientRun
 
@@ -920,15 +920,19 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
         raise e
 
     # Done with all evaluations, message outputs into final forms, and log results if needed.
-
-    # Since tracing is disabled, pass None for target_run so a dummy evaluation run will be created each time.
-    trace_destination = _trace_destination_from_project_scope(azure_ai_project) if azure_ai_project else None
-    studio_url = None
-    if trace_destination:
-        name_map = _map_names_to_builtins(evaluators, graders)
-        studio_url = _log_metrics_and_instance_results(
-            metrics, results_df, trace_destination, None, evaluation_name, name_map, **kwargs
+    name_map = _map_names_to_builtins(evaluators, graders)
+    if isinstance(azure_ai_project, str):
+        studio_url = _log_metrics_and_instance_results_onedp(
+            metrics, results_df, azure_ai_project, evaluation_name, name_map, **kwargs
         )
+    else:
+        # Since tracing is disabled, pass None for target_run so a dummy evaluation run will be created each time.
+        trace_destination = _trace_destination_from_project_scope(azure_ai_project) if azure_ai_project else None
+        studio_url = None
+        if trace_destination:
+            studio_url = _log_metrics_and_instance_results(
+                metrics, results_df, trace_destination, None, evaluation_name, name_map, **kwargs
+            )
 
     result_df_dict = results_df.to_dict("records")
     result: EvaluationResult = {"rows": result_df_dict, "metrics": metrics, "studio_url": studio_url}  # type: ignore
@@ -1125,7 +1129,6 @@ def _run_callable_evaluators(
     # will be marked as outputs already so we do not need to rename them.
 
     input_data_df = _rename_columns_conditionally(validated_data["input_data_df"])
-
     eval_result_df = pd.concat([input_data_df, evaluators_result_df], axis=1, verify_integrity=True)
     eval_metrics = _aggregate_metrics(evaluators_result_df, evaluators)
     eval_metrics.update(evaluators_metric)

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_utils.py

Lines changed: 76 additions & 0 deletions

@@ -126,6 +126,82 @@ def process_message_content(content, images_folder_path):
                 f.write(image_data_binary)
     return None
 
+def _log_metrics_and_instance_results_onedp(
+    metrics: Dict[str, Any],
+    instance_results: pd.DataFrame,
+    project_url: str,
+    evaluation_name: Optional[str],
+    name_map: Dict[str, str],
+    **kwargs,
+) -> Optional[str]:
+
+    # One RP Client
+    from azure.ai.evaluation._azure._token_manager import AzureMLTokenManager
+    from azure.ai.evaluation._constants import TokenScope
+    from azure.ai.evaluation._common import EvaluationServiceOneDPClient, EvaluationUpload
+
+    credentials = AzureMLTokenManager(
+        TokenScope.COGNITIVE_SERVICES.value, LOGGER, credential=kwargs.get("credential")
+    )
+    client = EvaluationServiceOneDPClient(
+        endpoint=project_url,
+        credential=credentials
+    )
+
+    # Massaging before artifacts are put on disk
+    # Adding line_number as index column this is needed by UI to form link to individual instance run
+    instance_results["line_number"] = instance_results.index.values
+
+    artifact_name = "instance_results.jsonl"
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        # storing multi_modal images if exists
+        col_name = "inputs.conversation"
+        if col_name in instance_results.columns:
+            for item in instance_results[col_name].items():
+                value = item[1]
+                if "messages" in value:
+                    _store_multimodal_content(value["messages"], tmpdir)
+
+        # storing artifact result
+        tmp_path = os.path.join(tmpdir, artifact_name)
+
+        with open(tmp_path, "w", encoding=DefaultOpenEncoding.WRITE) as f:
+            f.write(instance_results.to_json(orient="records", lines=True))
+
+        properties = {
+            EvaluationRunProperties.RUN_TYPE: "eval_run",
+            EvaluationRunProperties.EVALUATION_RUN: "promptflow.BatchRun",
+            EvaluationRunProperties.EVALUATION_SDK: f"azure-ai-evaluation:{VERSION}",
+            EvaluationRunProperties.NAME_MAP: json.dumps(name_map),
+            "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
+        }
+
+        create_evaluation_result_response = client.create_evaluation_result(
+            name=uuid.uuid4(),
+            path=tmp_path,
+            metrics=metrics
+        )
+
+        upload_run_response = client.start_evaluation_run(
+            evaluation=EvaluationUpload(
+                display_name=evaluation_name,
+            )
+        )
+
+        update_run_response = client.update_evaluation_run(
+            name=upload_run_response.id,
+            evaluation=EvaluationUpload(
+                display_name=evaluation_name,
+                status="Completed",
+                outputs={
+                    'evaluationResultId': create_evaluation_result_response.id,
+                },
+                properties=properties,
+            )
+        )
+
+    return update_run_response.properties.get("AiStudioEvaluationUri")
+
 
 def _log_metrics_and_instance_results(
     metrics: Dict[str, Any],
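
For reference, a hedged sketch of what a direct call to the new helper looks like; in the SDK it is invoked from _evaluate, and the DataFrame columns, metrics, and project URL below are illustrative only:

# Sketch only: normally called from _evaluate; all values below are placeholders.
import pandas as pd
from azure.ai.evaluation._evaluate._utils import _log_metrics_and_instance_results_onedp

instance_results = pd.DataFrame(
    {"inputs.question": ["q1", "q2"], "outputs.f1.f1_score": [1.0, 0.5]}
)

studio_url = _log_metrics_and_instance_results_onedp(
    metrics={"f1.f1_score": 0.75},
    instance_results=instance_results,
    project_url="https://<account>.services.ai.azure.com/api/projects/<project>",
    evaluation_name="my-eval-run",
    name_map={},
)
print(studio_url)  # the "AiStudioEvaluationUri" property returned by the service, if present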

sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_lite_management_client.py

Lines changed: 2 additions & 2 deletions

@@ -1,7 +1,7 @@
 from typing import Any, Mapping
 import pytest
 import logging
-from azure.core.credentials import AzureSasCredential, TokenCredential
+from azure.core.credentials import AzureSasCredential, TokenCredential, AccessToken
 from azure.ai.evaluation._azure._clients import LiteMLClient
 
 
@@ -31,7 +31,7 @@ def test_get_token(self, project_scope, azure_cred):
         )
 
         token = client.get_token()
-        assert isinstance(token, str) and len(token) > 0
+        assert isinstance(token, AccessToken) and len(token) > 0
 
     @pytest.mark.azuretest
     @pytest.mark.parametrize("include_credentials", [False, True])
