diff --git a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
index e5403218dbd6..f56e535d0813 100644
--- a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
+++ b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
@@ -8,6 +8,8 @@
 ### Breaking Changes
 
 ### Bugs Fixed
+- Better handling of the edge case in `ToolCallAccuracyEvaluator` where the agent input contains no tool calls. Previously the evaluator threw an exception, which defaulted to a 'fail'. Now a 'nan' score is assigned and a 'pass' is assumed.
+
 - Fixed error in `evaluate` where data fields could not contain numeric characters. Previously, a data file with schema:
   ```
   "query1": "some query", "response: "some response"
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py
index 0b9a8a2b8da4..e62645c79d0a 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py
@@ -158,12 +158,9 @@ def _convert_kwargs_to_eval_input(self, **kwargs):
                 tool_calls.extend([content for content in message.get("content") if content.get("type") == "tool_call"])
 
         if len(tool_calls) == 0:
-            raise EvaluationException(
-                message="response does not have tool calls. Either provide tool_calls or response with tool calls.",
-                blame=ErrorBlame.USER_ERROR,
-                category=ErrorCategory.MISSING_FIELD,
-                target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
-            )
+            # Return an empty eval input when there are no tool calls. From the user's perspective this is preferable to
+            # raising an exception: they see explicitly that the evaluator did not run, rather than a null result.
+            return []
         if not isinstance(tool_calls, list):
             tool_calls = [tool_calls]
 
@@ -260,11 +257,18 @@ def _aggregate_results(self, per_turn_results):
 
         # Go over each turn, and rotate the results into a
         # metric: List[values] format for the evals_per_turn dictionary.
 
-        score = sum([1 if per_turn_result.get(self._result_key) else 0 for per_turn_result in per_turn_results])/len(per_turn_results)
-        aggregated[self._AGGREGATE_RESULT_KEY] = score
-        aggregated[f'{self._AGGREGATE_RESULT_KEY}_result'] = 'pass' if score >= self.threshold else 'fail'
-        aggregated[f'{self._AGGREGATE_RESULT_KEY}_threshold'] = self.threshold
+        if len(per_turn_results) == 0:
+            # When there are no tool calls there is nothing to evaluate, so the evaluator is assumed to have 'passed';
+            # assuming a failure could mislead the user into thinking there was a problem with the agent.
+            # Ideally a third value such as 'N/A' would be reported, but only 'pass' or 'fail' are allowed for now.
+            aggregated[self._AGGREGATE_RESULT_KEY] = math.nan
+            aggregated[f'{self._AGGREGATE_RESULT_KEY}_result'] = 'pass'
+        else:
+            score = sum([1 if per_turn_result.get(self._result_key) else 0 for per_turn_result in per_turn_results])/len(per_turn_results)
+            aggregated[self._AGGREGATE_RESULT_KEY] = score
+            aggregated[f'{self._AGGREGATE_RESULT_KEY}_result'] = 'pass' if score >= self.threshold else 'fail'
+            aggregated[f'{self._AGGREGATE_RESULT_KEY}_threshold'] = self.threshold
         aggregated["per_tool_call_details"] = per_turn_results
 
         return aggregated
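
For illustration only (not part of the diff): a minimal, self-contained sketch of the extraction step after this change. `extract_tool_calls` is a hypothetical stand-in for the relevant portion of `_convert_kwargs_to_eval_input`, not the SDK implementation; when the assistant messages contain no `tool_call` content it now yields an empty list instead of raising `EvaluationException`.

```python
# Hypothetical stand-in for the tool-call extraction inside
# _convert_kwargs_to_eval_input; a sketch, not the SDK code.
def extract_tool_calls(response):
    """Collect 'tool_call' content items from assistant messages."""
    tool_calls = []
    for message in response:
        content = message.get("content")
        if message.get("role") == "assistant" and isinstance(content, list):
            tool_calls.extend(c for c in content if c.get("type") == "tool_call")
    # Previously an exception was raised when tool_calls stayed empty;
    # now the empty list flows through, so the evaluator simply has nothing to score.
    return tool_calls


response_without_tool_calls = [
    {"role": "assistant", "content": [{"type": "text", "text": "Hello!"}]},
]
print(extract_tool_calls(response_without_tool_calls))  # -> []
```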
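
Similarly, a hedged sketch of the new aggregation rule: an empty per-turn result list yields a NaN score with a 'pass' verdict, otherwise the pass rate is compared against the threshold. The key names below mirror the diff but are assumed constants, and `aggregate_tool_call_results` is an illustrative stand-in for `_aggregate_results`, not the SDK method. Note that the new path relies on `math.nan`, so the module needs `math` available (the import is not visible in the hunk).

```python
import math

RESULT_KEY = "tool_call_accurate"      # assumed per-turn result key
AGGREGATE_KEY = "tool_call_accuracy"   # assumed aggregate result key


def aggregate_tool_call_results(per_turn_results, threshold=0.8):
    """Sketch of the aggregation behaviour introduced by the diff."""
    aggregated = {}
    if len(per_turn_results) == 0:
        # No tool calls: NaN score, 'pass' assumed rather than penalising the agent.
        aggregated[AGGREGATE_KEY] = math.nan
        aggregated[f"{AGGREGATE_KEY}_result"] = "pass"
    else:
        score = sum(1 if r.get(RESULT_KEY) else 0 for r in per_turn_results) / len(per_turn_results)
        aggregated[AGGREGATE_KEY] = score
        aggregated[f"{AGGREGATE_KEY}_result"] = "pass" if score >= threshold else "fail"
        aggregated[f"{AGGREGATE_KEY}_threshold"] = threshold
    aggregated["per_tool_call_details"] = per_turn_results
    return aggregated


print(aggregate_tool_call_results([]))
# -> {'tool_call_accuracy': nan, 'tool_call_accuracy_result': 'pass', 'per_tool_call_details': []}
print(aggregate_tool_call_results([{RESULT_KEY: True}, {RESULT_KEY: False}]))
# -> score 0.5, result 'fail' with threshold=0.8
```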