 class OAIEvalRunCreationInfo(TypedDict, total=True):
     """Configuration for an evaluator"""

+    client: Union[AzureOpenAI, OpenAI]
     eval_group_id: str
     eval_run_id: str
     grader_name_map: Dict[str, str]
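The run-info dictionary now carries the client that created the run, so later polling no longer needs a separately threaded client argument. A minimal sketch of what an entry might look like (the IDs, grader name, and the bare OpenAI() construction are illustrative assumptions, not values from this PR):

from openai import OpenAI

# Hypothetical entry; real values come from the eval group/run creation calls further down.
run_info = OAIEvalRunCreationInfo(
    client=OpenAI(),                      # or AzureOpenAI(...); credentials assumed to come from the environment
    eval_group_id="eval_group_123",       # illustrative ID
    eval_run_id="eval_run_456",           # illustrative ID
    grader_name_map={"grader_0": "my_grader"},  # maps OAI-side grader names back to user-supplied names
)
# _get_single_run_results later polls with run_info["client"] instead of a passed-in client.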
@@ -55,7 +56,6 @@ def _split_evaluators_and_grader_configs(

 @experimental
 def _begin_aoai_evaluation(
-    client: Union[OpenAI, AzureOpenAI],
     graders: Dict[str, AzureOpenAIGrader],
     column_mappings: Optional[Dict[str, Dict[str, str]]],
     data: pd.DataFrame,
@@ -88,11 +88,8 @@ def _begin_aoai_evaluation(
     LOGGER.info("AOAI: Aoai graders detected among evaluator inputs. Preparing to create OAI eval group...")
     all_eval_run_info: List[OAIEvalRunCreationInfo] = []

-    if len(all_eval_run_info) > 1:
-        LOGGER.info("AOAI: Grader-specific column mappings detected. Splitting up evaluation runs to avoid conflicts...")
     for selected_graders, selected_column_mapping in _get_graders_and_column_mappings(graders, column_mappings):
         all_eval_run_info.append(_begin_single_aoai_evaluation(
-            client,
             selected_graders,
             data,
             selected_column_mapping,
@@ -102,7 +99,6 @@ def _begin_aoai_evaluation(
     return all_eval_run_info

 def _begin_single_aoai_evaluation(
-    client: Union[OpenAI, AzureOpenAI],
     graders: Dict[str, AzureOpenAIGrader],
     data: pd.DataFrame,
     column_mapping: Dict[str, str],
@@ -113,8 +109,6 @@ def _begin_single_aoai_evaluation(
     AOAI evaluation runs must be queried for completion, so this returns a poller to accomplish that task
     at a later time.

-    :param client: The AOAI client to use for the evaluation.
-    :type client: Union[OpenAI, AzureOpenAI]
     :param graders: The graders to use for the evaluation. Should be a dictionary of string to AOAIGrader.
     :type graders: Dict[str, AoaiGrader]
     :param data_source_config: The data source configuration to apply to the
@@ -129,7 +123,10 @@ def _begin_single_aoai_evaluation(
     # Format data for eval group creation
     grader_name_list = []
     grader_list = []
-
+    # It's expected that all graders supplied for a single eval run use the same credentials,
+    # so grab a client from the first grader.
+    client = list(graders.values())[0].get_client()
+
     for name, grader in graders.items():
         grader_name_list.append(name)
         grader_list.append(grader.get_grader_config())
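The client is taken from the first grader on the assumption that every grader in a single run shares credentials. A hypothetical guard (not part of this change) that makes that assumption explicit could compare the endpoints of each grader's client; treating `base_url` as available on the returned OpenAI/AzureOpenAI client is itself an assumption:

def _check_graders_share_client(graders: Dict[str, AzureOpenAIGrader]) -> None:
    # Illustrative guard only: compare the endpoint of each grader's client against the first one.
    clients = [grader.get_client() for grader in graders.values()]
    first = clients[0]
    for other in clients[1:]:
        if str(other.base_url) != str(first.base_url):
            raise ValueError("All graders in a single AOAI eval run are expected to share credentials.")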
@@ -163,7 +160,7 @@ def _begin_single_aoai_evaluation(
     LOGGER.info(f"AOAI: Eval run created with id {eval_run_id}." +
         " Results will be retrieved after normal evaluation is complete...")

-    return OAIEvalRunCreationInfo(eval_group_id=eval_group_info.id, eval_run_id=eval_run_id, grader_name_map=grader_name_map)
+    return OAIEvalRunCreationInfo(client=client, eval_group_id=eval_group_info.id, eval_run_id=eval_run_id, grader_name_map=grader_name_map)

 def _get_evaluation_run_results(
     client: Union[OpenAI, AzureOpenAI],
@@ -174,8 +171,6 @@ def _get_evaluation_run_results(
     pipeline to consume. This method accepts a list of eval run information, and will combine the
     results into a single dataframe and metrics dictionary.

-    :param client: The AOAI client to use for the evaluation.
-    :type client: Union[OpenAI, AzureOpenAI]
     :param all_run_info: A list of evaluation run information that contains the needed values
         to retrieve the results of the evaluation run.
     :type all_run_info: List[OAIEvalRunCreationInfo]
@@ -188,25 +183,19 @@ def _get_evaluation_run_results(
     run_metrics = {}
     output_df = pd.DataFrame()
     for run_info in all_run_info:
-        cur_output_df, cur_run_metrics = _get_single_run_results(
-            client,
-            run_info
-        )
+        cur_output_df, cur_run_metrics = _get_single_run_results(run_info)
         output_df = pd.concat([output_df, cur_output_df], axis=1)
         run_metrics.update(cur_run_metrics)

     return output_df, run_metrics

 def _get_single_run_results(
-    client: Union[OpenAI, AzureOpenAI],
     run_info: OAIEvalRunCreationInfo,
 ) -> Tuple[pd.DataFrame, Dict[str, Any]]:
     """
     Get the results of an OAI evaluation run, formatted in a way that is easy for the rest of the evaluation
     pipeline to consume.

-    :param client: The AOAI client to use for the evaluation.
-    :type client: Union[OpenAI, AzureOpenAI]
     :param run_info: The evaluation run information that contains the needed values
         to retrieve the results of the evaluation run.
     :type run_info: OAIEvalRunCreationInfo
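Per-run results are stitched together column-wise while per-run metrics are merged into a single dictionary. A toy illustration of that combination step (column names, values, and metric keys are made up; the real frames come from _get_single_run_results):

import pandas as pd

# Two pretend per-run result frames, one column block per grader.
df_a = pd.DataFrame({"grader_a.passed": [True, False], "grader_a.score": [1.0, 0.0]})
df_b = pd.DataFrame({"grader_b.passed": [True, True], "grader_b.score": [0.9, 0.7]})

output_df = pd.concat([df_a, df_b], axis=1)  # one row per input line, columns from both runs side by side
run_metrics = {}
run_metrics.update({"grader_a.pass_rate": 0.5})
run_metrics.update({"grader_b.pass_rate": 1.0})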
@@ -216,7 +205,7 @@ def _get_single_run_results(
     :raises EvaluationException: If the evaluation run fails or is not completed before timing out.
     """
     # Wait for evaluation run to complete
-    run_results = _wait_for_run_conclusion(client, run_info["eval_group_id"], run_info["eval_run_id"])
+    run_results = _wait_for_run_conclusion(run_info["client"], run_info["eval_group_id"], run_info["eval_run_id"])
     if run_results.status != "completed":
         raise EvaluationException(
             message=f"AOAI evaluation run {run_info['eval_group_id']}/{run_info['eval_run_id']}"
@@ -248,7 +237,7 @@ def _get_single_run_results(
     # The passed and score values are then added to the results dictionary, prepended with the grader's name
     # as entered by the user in the inputted dictionary.
     # Other values, if they exist, are also added to the results dictionary.
-    raw_list_results = client.evals.runs.output_items.list(
+    raw_list_results = run_info["client"].evals.runs.output_items.list(
         eval_id=run_info["eval_group_id"],
         run_id=run_info["eval_run_id"]
     )
@@ -273,41 +262,6 @@ def _get_single_run_results(
     return output_df, run_metrics


-def _are_individual_runs_needed(
-    graders: Dict[str, AzureOpenAIGrader],
-    column_mapping: Optional[Dict[str, str]] = None
-) -> bool:
-    """
-    Given an input set of graders and their column mapping, determine if
-    the graders can be executed together under a single evaluation run,
-    or if they must be handled individually.
-
-    For simplicity's sake, the individual run condition is met if there are at least
-    two graders, and if any of them have a unique column mapping.
-
-    This is done to avoid the possibility of conflicting mappings, since OAI requires
-    unique input name assignments to each evaluation group.
-
-    :param graders: The graders to use for the evaluation. Should be a dictionary of string to AOAIGrader.
-    :type graders: Dict[str, AoaiGrader]
-    :param column_mapping: The column mapping to check.
-    :type column_mapping: Optional[Dict[str, str]]
-    :return: True if the graders require individual runs, False otherwise.
-    :rtype: bool
-    """
-    if len(graders) < 2:
-        # Only one grader, no need for individual runs.
-        return False
-    if column_mapping is None:
-        # No column mapping provided, no need for individual runs.
-        return False
-    # Check if any of the graders have a unique column mapping.
-    for name in graders.keys():
-        if name in column_mapping:
-            # Grader with a unique column mapping found. Individual runs are needed.
-            return True
-    return False
-

 def _convert_remote_eval_params_to_grader(grader_id: str, init_params: Dict[str, Any]) -> AzureOpenAIGrader:
     """
@@ -380,6 +334,10 @@ def _get_graders_and_column_mappings(
     the OAI API can't. So, if there's a possibility that such a conflict might arise,
     we need to split the incoming data up.

+    Currently splits each grader into its own eval group/run to ensure they each use
+    their own credentials later on. A planned fast follow is to group graders by
+    matching credentials instead.
+
     :param graders: The graders to use for the evaluation. Should be a dictionary of string to AOAIGrader.
     :type graders: Dict[str, AoaiGrader]
     :param column_mappings: The column mappings to use for the evaluation.
@@ -388,15 +346,9 @@ def _get_graders_and_column_mappings(
         and the column mapping they should use.
     :rtype: List[Tuple[Dict[str, AoaiGrader], Optional[Dict[str, str]]]]
     """
-    if column_mappings is None:
-        # No column mappings provided, no need to split.
-        return [(graders, None)]
+
     default_mapping = column_mappings.get("default", None)
-    if not any(name in column_mappings for name in graders.keys()):
-        # No unique column mappings provided, no need to split.
-        return [(graders, default_mapping)]
-    # At least one grader has a unique column mapping, split graders up
-    return [({name:grader}, column_mappings.get(name, default_mapping)) for name, grader in graders.items()]
+    return [({name: grader}, column_mappings.get(name, default_mapping)) for name, grader in graders.items()]

 def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]) -> Dict[str, Any]:
     """Produce a data source config that maps all columns from the supplied data source into
@@ -499,7 +451,6 @@ def _get_data_source(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]
         }
     }

-
 def _begin_eval_run(
     client: Union[OpenAI, AzureOpenAI],
     eval_group_id: str,