Commit abdb8e0

make all graders use their own credential-based client (#40774)
1 parent 20244cf commit abdb8e0

File tree

2 files changed: +16 -71 lines


sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py

+1 -7

@@ -844,7 +844,6 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
     input_data_df = validated_data["input_data_df"]
     results_df = pd.DataFrame()
     metrics: Dict[str, float] = {}
-    oai_client: Optional[Union[OpenAI, AzureOpenAI]] = None
     eval_run_info_list: List[OAIEvalRunCreationInfo] = []
 
     # Start OAI eval runs if any graders are present.
@@ -853,14 +852,9 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
     need_get_oai_results = False
     got_local_results = False
     if need_oai_run:
-        for grader in graders.values():
-            if isinstance(grader, AzureOpenAIGrader):
-                oai_client = grader.get_client()
-                break
         try:
             aoi_name = evaluation_name if evaluation_name else DEFAULT_OAI_EVAL_RUN_NAME
             eval_run_info_list = _begin_aoai_evaluation(
-                oai_client, # type: ignore
                 graders,
                 column_mapping,
                 input_data_df,
@@ -900,7 +894,7 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
     # Retrieve OAI eval run results if needed.
     if need_get_oai_results:
         try:
-            aoai_results, aoai_metrics = _get_evaluation_run_results(oai_client, eval_run_info_list) # type: ignore
+            aoai_results, aoai_metrics = _get_evaluation_run_results(eval_run_info_list) # type: ignore
             # Post build TODO: add equivalent of _print_summary(per_evaluator_results) here
 
     # Combine results if both evaluators and graders are present
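
For orientation, the hunk above deletes the step where _evaluate picked one shared client by scanning for the first AzureOpenAIGrader. A small, self-contained sketch of that removed behavior, and of why it breaks down, using illustrative stand-ins rather than the SDK's real types:

# Sketch only: stand-in for the removed "first grader wins" client selection.
from dataclasses import dataclass
from typing import Dict

@dataclass
class StubGrader:  # hypothetical stand-in for AzureOpenAIGrader
    credential: str

    def get_client(self) -> str:
        # A real grader returns an OpenAI/AzureOpenAI client built from its own credential.
        return f"client({self.credential})"

graders: Dict[str, StubGrader] = {"relevance": StubGrader("key-A"), "safety": StubGrader("key-B")}

# Old behavior: one client, taken from whichever grader is found first, serviced every run,
# so the "safety" grader's key-B credential was never used.
shared_client = next(iter(graders.values())).get_client()
print(shared_client)  # client(key-A)

After this commit no shared client exists at this level; each run records the client its own grader built, as the second file shows.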

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py

+15 -64

@@ -25,6 +25,7 @@
 class OAIEvalRunCreationInfo(TypedDict, total=True):
     """Configuration for an evaluator"""
 
+    client: Union[AzureOpenAI, OpenAI]
     eval_group_id: str
     eval_run_id: str
     grader_name_map: Dict[str, str]
@@ -55,7 +56,6 @@ def _split_evaluators_and_grader_configs(
 
 @experimental
 def _begin_aoai_evaluation(
-    client: Union[OpenAI, AzureOpenAI],
     graders: Dict[str, AzureOpenAIGrader],
     column_mappings: Optional[Dict[str, Dict[str, str]]],
     data: pd.DataFrame,
@@ -88,11 +88,8 @@ def _begin_aoai_evaluation(
     LOGGER.info("AOAI: Aoai graders detected among evaluator inputs. Preparing to create OAI eval group...")
     all_eval_run_info: List[OAIEvalRunCreationInfo] = []
 
-    if len(all_eval_run_info) > 1:
-        LOGGER.info("AOAI: Grader-specific column mappings detected. Splitting up evaluation runs to avoid conflicts...")
     for selected_graders, selected_column_mapping in _get_graders_and_column_mappings(graders, column_mappings):
         all_eval_run_info.append(_begin_single_aoai_evaluation(
-            client,
             selected_graders,
             data,
             selected_column_mapping,
@@ -102,7 +99,6 @@
     return all_eval_run_info
 
 def _begin_single_aoai_evaluation(
-    client: Union[OpenAI, AzureOpenAI],
     graders: Dict[str, AzureOpenAIGrader],
     data: pd.DataFrame,
     column_mapping: Dict[str, str],
@@ -113,8 +109,6 @@ def _begin_single_aoai_evaluation(
     AOAI evaluation runs must be queried for completion, so this returns a poller to accomplish that task
     at a later time.
 
-    :param client: The AOAI client to use for the evaluation.
-    :type client: Union[OpenAI, AzureOpenAI]
     :param graders: The graders to use for the evaluation. Should be a dictionary of string to AOAIGrader.
     :type graders: Dict[str, AoaiGrader]
     :param data_source_config: The data source configuration to apply to the
@@ -129,7 +123,10 @@ def _begin_single_aoai_evaluation(
     # Format data for eval group creation
     grader_name_list = []
     grader_list = []
-
+    # It's expected that all graders supplied for a single eval run use the same credentials
+    # so grab a client from the first grader.
+    client = list(graders.values())[0].get_client()
+
     for name, grader in graders.items():
         grader_name_list.append(name)
         grader_list.append(grader.get_grader_config())
@@ -163,7 +160,7 @@ def _begin_single_aoai_evaluation(
     LOGGER.info(f"AOAI: Eval run created with id {eval_run_id}." +
         " Results will be retrieved after normal evaluation is complete...")
 
-    return OAIEvalRunCreationInfo(eval_group_id=eval_group_info.id, eval_run_id=eval_run_id, grader_name_map=grader_name_map)
+    return OAIEvalRunCreationInfo(client=client, eval_group_id=eval_group_info.id, eval_run_id=eval_run_id, grader_name_map=grader_name_map)
 
 def _get_evaluation_run_results(
     client: Union[OpenAI, AzureOpenAI],
@@ -174,8 +171,6 @@ def _get_evaluation_run_results(
     pipeline to consume. This method accepts a list of eval run information, and will combine the
     results into a single dataframe and metrics dictionary.
 
-    :param client: The AOAI client to use for the evaluation.
-    :type client: Union[OpenAI, AzureOpenAI]
     :param all_run_info: A list of evaluation run information that contains the needed values
         to retrieve the results of the evaluation run.
     :type all_run_info: List[OAIEvalRunCreationInfo]
@@ -188,25 +183,19 @@ def _get_evaluation_run_results(
     run_metrics = {}
     output_df = pd.DataFrame()
     for run_info in all_run_info:
-        cur_output_df, cur_run_metrics = _get_single_run_results(
-            client,
-            run_info
-        )
+        cur_output_df, cur_run_metrics = _get_single_run_results(run_info)
         output_df = pd.concat([output_df, cur_output_df], axis=1)
         run_metrics.update(cur_run_metrics)
 
     return output_df, run_metrics
 
 def _get_single_run_results(
-    client: Union[OpenAI, AzureOpenAI],
     run_info: OAIEvalRunCreationInfo,
 ) -> Tuple[pd.DataFrame, Dict[str, Any]]:
     """
     Get the results of an OAI evaluation run, formatted in a way that is easy for the rest of the evaluation
     pipeline to consume.
 
-    :param client: The AOAI client to use for the evaluation.
-    :type client: Union[OpenAI, AzureOpenAI]
     :param run_info: The evaluation run information that contains the needed values
         to retrieve the results of the evaluation run.
     :type run_info: OAIEvalRunCreationInfo
@@ -216,7 +205,7 @@ def _get_single_run_results(
     :raises EvaluationException: If the evaluation run fails or is not completed before timing out.
     """
     # Wait for evaluation run to complete
-    run_results = _wait_for_run_conclusion(client, run_info["eval_group_id"], run_info["eval_run_id"])
+    run_results = _wait_for_run_conclusion(run_info["client"], run_info["eval_group_id"], run_info["eval_run_id"])
     if run_results.status != "completed":
         raise EvaluationException(
             message=f"AOAI evaluation run {run_info['eval_group_id']}/{run_info['eval_run_id']}"
@@ -248,7 +237,7 @@ def _get_single_run_results(
     # The passed and score values are then added to the results dictionary, prepended with the grader's name
     # as entered by the user in the inputted dictionary.
     # Other values, if they exist, are also added to the results dictionary.
-    raw_list_results = client.evals.runs.output_items.list(
+    raw_list_results = run_info["client"].evals.runs.output_items.list(
         eval_id=run_info["eval_group_id"],
         run_id=run_info["eval_run_id"]
     )
@@ -273,41 +262,6 @@ def _get_single_run_results(
 
     return output_df, run_metrics
 
-def _are_individual_runs_needed(
-    graders: Dict[str, AzureOpenAIGrader],
-    column_mapping: Optional[Dict[str, str]] = None
-) -> bool:
-    """
-    Given an input set of graders and their column mapping, determine if
-    the graders can be executed together under a single evaluation run,
-    or if they must be handled individually.
-
-    For simplicity's sake, the individual run condition is met if there are at least
-    two graders, and if any of them have a unique column mapping.
-
-    This is done to avoid the possibility of conflicting mappings, since OAI requires
-    unique input name assignments to each evaluation group.
-
-    :param graders: The graders to use for the evaluation. Should be a dictionary of string to AOAIGrader.
-    :type graders: Dict[str, AoaiGrader]
-    :param column_mapping: The column mapping to check.
-    :type column_mapping: Optional[Dict[str, str]]
-    :return: True if the graders require individual runs, False otherwise.
-    :rtype: bool
-    """
-    if len(graders) < 2:
-        # Only one grader, no need for individual runs.
-        return False
-    if column_mapping is None:
-        # No column mapping provided, no need for individual runs.
-        return False
-    # Check if any of the graders have a unique column mapping.
-    for name in graders.keys():
-        if name in column_mapping:
-            # Grader with a unique column mapping found. Individual runs are needed.
-            return True
-    return False
-
 
 def _convert_remote_eval_params_to_grader(grader_id: str, init_params: Dict[str, Any]) -> AzureOpenAIGrader:
     """
@@ -380,6 +334,10 @@ def _get_graders_and_column_mappings(
     the OAI API can't. So, if if there's a possibility that such a conflict might arise,
     we need to split the incoming data up.
 
+    Currently splits each grader into its own eval group/run to ensure they each use
+    their own credentials later on. Planned fast follow is to group things by
+    matching credentials later.
+
     :param graders: The graders to use for the evaluation. Should be a dictionary of string to AOAIGrader.
     :type graders: Dict[str, AoaiGrader]
     :param column_mappings: The column mappings to use for the evaluation.
@@ -388,15 +346,9 @@
         and the column mapping they should use.
     :rtype: List[Tuple[Dict[str, AoaiGrader], Optional[Dict[str, str]]]]
     """
-    if column_mappings is None:
-        # No column mappings provided, no need to split.
-        return [(graders, None)]
+
     default_mapping = column_mappings.get("default", None)
-    if not any(name in column_mappings for name in graders.keys()):
-        # No unique column mappings provided, no need to split.
-        return [(graders, default_mapping)]
-    # At least one grader has a unique column mapping, split graders up
-    return [({name:grader}, column_mappings.get(name, default_mapping)) for name, grader in graders.items()]
+    return [({name : grader}, column_mappings.get(name, default_mapping)) for name, grader in graders.items()]
 
 def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]) -> Dict[str, Any]:
     """Produce a data source config that maps all columns from the supplied data source into
@@ -499,7 +451,6 @@ def _get_data_source(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]
         }
     }
 
-
 def _begin_eval_run(
     client: Union[OpenAI, AzureOpenAI],
     eval_group_id: str,
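
Taken together, the new flow is: each grader builds a client from its own credential via get_client(), each grader gets its own eval group/run, and the resulting OAIEvalRunCreationInfo carries that client so polling and result retrieval reuse it. A minimal, runnable sketch of that pattern; every name below is an illustrative stand-in, and only the get_client() call and the run-info fields mirror the diff above:

from dataclasses import dataclass
from typing import Any, Dict, List, TypedDict

class RunInfo(TypedDict):  # stand-in for OAIEvalRunCreationInfo
    client: Any            # the client that created the run, reused later to fetch results
    eval_group_id: str
    eval_run_id: str
    grader_name_map: Dict[str, str]

@dataclass
class StubClient:  # stands in for an OpenAI/AzureOpenAI client bound to one credential
    credential: str

@dataclass
class StubGrader:  # hypothetical stand-in for AzureOpenAIGrader
    credential: str

    def get_client(self) -> StubClient:
        return StubClient(self.credential)

def start_runs(graders: Dict[str, StubGrader]) -> List[RunInfo]:
    # One run per grader, so graders configured with different credentials never share a client.
    runs: List[RunInfo] = []
    for i, (name, grader) in enumerate(graders.items()):
        client = grader.get_client()
        runs.append(RunInfo(client=client, eval_group_id=f"group-{i}", eval_run_id=f"run-{i}",
                            grader_name_map={"grader-0": name}))
    return runs

def collect(runs: List[RunInfo]) -> Dict[str, str]:
    # Retrieval needs no shared client: each run is read back with the client it stored.
    return {run["eval_run_id"]: run["client"].credential for run in runs}

if __name__ == "__main__":
    graders = {"relevance": StubGrader("key-A"), "safety": StubGrader("key-B")}
    print(collect(start_runs(graders)))  # {'run-0': 'key-A', 'run-1': 'key-B'}

As the added docstring notes, the planned fast follow is to group graders with matching credentials into a single run instead of always splitting one run per grader.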
