Commit

feat: Upload dataset URI, metric criteria, and rating rubrics used in an evaluation to GCS.

PiperOrigin-RevId: 723940715
vertex-sdk-bot authored and copybara-github committed Feb 6, 2025
1 parent d4cae46 commit aabb579
Showing 4 changed files with 49 additions and 1 deletion.
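To make the scope of the change concrete, here is a minimal usage sketch of the code path this commit extends, assuming the public `EvalTask`, `PointwiseMetric`, and `PointwiseMetricPromptTemplate` API from `vertexai.evaluation`; the project, bucket, criteria, and rubric values are placeholders, not part of the diff.

```python
import vertexai
from vertexai.evaluation import (
    EvalTask,
    PointwiseMetric,
    PointwiseMetricPromptTemplate,
)

vertexai.init(project="my-project", location="us-central1")  # placeholder project

# A model-based metric whose prompt template carries explicit criteria and a rating rubric.
fluency_metric = PointwiseMetric(
    metric="fluency",
    metric_prompt_template=PointwiseMetricPromptTemplate(
        criteria={"fluency": "The response is grammatical and easy to read."},
        rating_rubric={"1": "Poor", "3": "Adequate", "5": "Excellent"},
    ),
)

eval_task = EvalTask(
    dataset="gs://my-bucket/eval-dataset.csv",       # a URI string, so dataset_uri gets recorded
    metrics=[fluency_metric],
    output_uri_prefix="gs://my-bucket/eval-results",
)

# With this commit, evaluate() also writes the dataset URI and each metric's
# criteria/rating rubric into summary_metrics.json under output_uri_prefix.
result = eval_task.evaluate()  # assumes the dataset already has a "response" column
```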
13 changes: 13 additions & 0 deletions tests/unit/vertexai/test_evaluation.py
@@ -2145,6 +2145,8 @@ def test_upload_results(self, mock_storage_blob_from_string):
_TEST_FILE_NAME,
"candidate_model",
"baseline_model",
"gs://test-bucket/test-dataset.csv",
[_TEST_POINTWISE_METRIC, _TEST_PAIRWISE_METRIC],
)

mock_storage_blob_from_string.assert_any_call(
@@ -2160,6 +2162,17 @@ def test_upload_results(self, mock_storage_blob_from_string):
"summary_metrics": MOCK_EVAL_RESULT.summary_metrics,
"candidate_model_name": "candidate_model",
"baseline_model_name": "baseline_model",
"dataset_uri": "gs://test-bucket/test-dataset.csv",
"metric_descriptions": {
"test_pointwise_metric": {
"criteria": _CRITERIA,
"rating_rubric": _POINTWISE_RATING_RUBRIC,
},
"test_pairwise_metric": {
"criteria": _CRITERIA,
"rating_rubric": _PAIRWISE_RATING_RUBRIC,
},
},
},
mock.ANY,
)
7 changes: 7 additions & 0 deletions vertexai/evaluation/eval_task.py
@@ -284,6 +284,7 @@ def __init__(
output_uri_prefix: GCS location to store the metrics_table from
evaluation results.
"""
self._raw_dataset = dataset
self._dataset = utils.load_dataset(dataset)
self._metrics = metrics
self._experiment = experiment
@@ -481,12 +482,18 @@ def evaluate(
if isinstance(baseline_model, generative_models.GenerativeModel):
baseline_model_name = baseline_model._model_name

dataset_uri = None
if isinstance(self._raw_dataset, str):
dataset_uri = self._raw_dataset

utils.upload_evaluation_results(
eval_result,
self.output_uri_prefix,
output_file_name,
candidate_model_name,
baseline_model_name,
dataset_uri,
self.metrics,
)
return eval_result
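A small illustration of the branch added above: `dataset_uri` is recorded only when `EvalTask` was constructed with a string dataset (for example a `gs://` or `bq://` URI), while an in-memory DataFrame leaves it as `None`. The values below are placeholders.

```python
import pandas as pd

# Mirrors the isinstance check added in evaluate(); values are placeholders.
for raw_dataset in ("gs://my-bucket/eval-dataset.csv", pd.DataFrame({"response": ["ok"]})):
    dataset_uri = raw_dataset if isinstance(raw_dataset, str) else None
    print(dataset_uri)  # prints the URI for the string input, None for the DataFrame
```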

1 change: 1 addition & 0 deletions vertexai/evaluation/metrics/_base.py
@@ -68,6 +68,7 @@ def __init__(
the model-based evaluation. A freeform string is also accepted.
"""
super().__init__(metric=metric)
self._raw_metric_prompt_template = metric_prompt_template
self.metric_prompt_template = str(metric_prompt_template)
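Keeping the original template object alongside its rendered string is what later lets the summary upload read the template's criteria and rating rubric. A short sketch of the two attributes, assuming the SDK's `PointwiseMetricPromptTemplate`; the criteria and rubric text are placeholders.

```python
from vertexai.evaluation import PointwiseMetric, PointwiseMetricPromptTemplate

template = PointwiseMetricPromptTemplate(
    criteria={"clarity": "The response is unambiguous."},  # placeholder criteria
    rating_rubric={"1": "Unclear", "5": "Very clear"},      # placeholder rubric
)
metric = PointwiseMetric(metric="clarity", metric_prompt_template=template)

metric.metric_prompt_template        # str(template): the rendered prompt text only
metric._raw_metric_prompt_template   # the template object, still exposing _criteria
                                     # and _rating_rubric for the GCS summary upload
```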


29 changes: 28 additions & 1 deletion vertexai/evaluation/utils.py
@@ -22,7 +22,7 @@
import tempfile
import threading
import time
from typing import Any, Callable, Dict, Literal, Optional, TYPE_CHECKING, Union
from typing import Any, Callable, Dict, List, Literal, Optional, TYPE_CHECKING, Union

from google.cloud import bigquery
from google.cloud import storage
@@ -35,6 +35,10 @@
evaluation_service as gapic_evaluation_services,
)
from vertexai.evaluation import _base as eval_base
from vertexai.evaluation.metrics import (
_base as metrics_base,
metric_prompt_template as metric_prompt_template_base,
)


if TYPE_CHECKING:
@@ -286,6 +290,8 @@ def _upload_evaluation_summary_to_gcs(
upload_gcs_path: str,
candidate_model_name: Optional[str] = None,
baseline_model_name: Optional[str] = None,
dataset_uri: Optional[str] = None,
metrics: Optional[List[Union[str, metrics_base._Metric]]] = None,
) -> None:
"""Uploads the evaluation summary to a GCS bucket."""
summary = {
@@ -295,6 +301,21 @@
summary["candidate_model_name"] = candidate_model_name
if baseline_model_name:
summary["baseline_model_name"] = baseline_model_name
if dataset_uri:
summary["dataset_uri"] = dataset_uri

if metrics:
metric_descriptions = {}
for metric in metrics:
if isinstance(metric, metrics_base._ModelBasedMetric) and isinstance(
metric._raw_metric_prompt_template,
metric_prompt_template_base._MetricPromptTemplate,
):
metric_descriptions[metric.metric_name] = {
"criteria": metric._raw_metric_prompt_template._criteria,
"rating_rubric": metric._raw_metric_prompt_template._rating_rubric,
}
summary["metric_descriptions"] = metric_descriptions

with tempfile.TemporaryDirectory() as temp_dir:
local_summary_path = os.path.join(temp_dir, "summary_metrics.json")
@@ -318,6 +339,8 @@ def upload_evaluation_results(
file_name: str,
candidate_model_name: Optional[str] = None,
baseline_model_name: Optional[str] = None,
dataset_uri: Optional[str] = None,
metrics: Optional[List[Union[str, metrics_base._Metric]]] = None,
) -> None:
"""Uploads eval results to GCS destination.
@@ -327,6 +350,8 @@
file_name: File name to store the metrics table.
candidate_model_name: Optional. Candidate model name.
baseline_model_name: Optional. Baseline model name.
dataset_uri: Optional. URI pointing to the dataset.
metrics: Optional. List of metrics used for evaluation.
"""
if not destination_uri_prefix:
_ipython_utils.display_gen_ai_evaluation_results_button()
@@ -346,6 +371,8 @@
output_folder + "/summary_metrics.json",
candidate_model_name,
baseline_model_name,
dataset_uri,
metrics,
)
_ipython_utils.display_gen_ai_evaluation_results_button(
metrics_table_path.split(_GCS_PREFIX)[1]
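Putting the pieces together, the summary_metrics.json written under the output URI prefix gains two fields with this change. A hedged sketch of its shape (the keys come from the diff; the concrete values are placeholders):

```python
# Approximate shape of the uploaded summary_metrics.json after this change
# (keys from the diff; values are placeholders).
summary = {
    "summary_metrics": {"row_count": 2, "fluency/mean": 4.5},
    "candidate_model_name": "candidate_model",          # only when a candidate model name is known
    "baseline_model_name": "baseline_model",            # only for pairwise evaluations
    "dataset_uri": "gs://my-bucket/eval-dataset.csv",   # new: set when the dataset was a URI string
    "metric_descriptions": {                            # new: one entry per model-based metric
        "fluency": {
            "criteria": {"fluency": "The response is grammatical and easy to read."},
            "rating_rubric": {"1": "Poor", "3": "Adequate", "5": "Excellent"},
        },
    },
}
```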