Azure · JoseCSantos · Mar 28, 2025 · Apr 23, 2025 · Apr 23, 2025 · Apr 23, 2025
@@ -8,6 +8,8 @@
 ### Breaking Changes
 
 ### Bugs Fixed
+- Improved handling of the edge case in `ToolCallAccuracyEvaluator` where no tool calls are present in the agent input. Previously, the evaluator threw an exception, which defaulted to a 'fail'. Now, a 'nan' score is assigned and a 'pass' is assumed.
+
 - Fixed error in `evaluate` where data fields could not contain numeric characters. Previously, a data file with schema:
     ```
     "query1": "some query", "response": "some response"

@@ -158,12 +158,9 @@ def _convert_kwargs_to_eval_input(self, **kwargs):
                         tool_calls.extend([content for content in message.get("content")
                                         if content.get("type") == "tool_call"])
             if len(tool_calls) == 0:
-                raise EvaluationException(
-                    message="response does not have tool calls. Either provide tool_calls or response with tool calls.",
-                    blame=ErrorBlame.USER_ERROR,
-                    category=ErrorCategory.MISSING_FIELD,
-                    target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
-                )
+                # return empty input when there are no tool calls. From a user perspective this is preferable to raising an exception 
+                # as the user will see explicitly the evaluator did not run, rather than seeing a null
+                return []
 
         if not isinstance(tool_calls, list):
             tool_calls = [tool_calls]
@@ -260,11 +257,18 @@ def _aggregate_results(self, per_turn_results):
         # Go over each turn, and rotate the results into a
         # metric: List[values] format for the evals_per_turn dictionary.
 
-        score = sum([1 if per_turn_result.get(self._result_key) else 0 for per_turn_result in per_turn_results])/len(per_turn_results)
-        aggregated[self._AGGREGATE_RESULT_KEY] = score
-        aggregated[f'{self._AGGREGATE_RESULT_KEY}_result'] = 'pass' if score >= self.threshold else 'fail'
-        aggregated[f'{self._AGGREGATE_RESULT_KEY}_threshold'] = self.threshold
+        if len(per_turn_results) == 0:
+            aggregated[self._AGGREGATE_RESULT_KEY] = math.nan
+            # when there are no tool calls, we assume the evaluator 'passed' as there is nothing to evaluate
+            # assuming a failure could mislead the user into thinking there was a problem with the agent
+            # however, ideally we would like to have a third value like 'N/A' but only 'pass' or 'fail' are allowed for now
+            aggregated[f'{self._AGGREGATE_RESULT_KEY}_result'] = 'pass'
+        else:
+            score = sum([1 if per_turn_result.get(self._result_key) else 0 for per_turn_result in per_turn_results])/len(per_turn_results)
+            aggregated[self._AGGREGATE_RESULT_KEY] = score
+            aggregated[f'{self._AGGREGATE_RESULT_KEY}_result'] = 'pass' if score >= self.threshold else 'fail'
 
+        aggregated[f'{self._AGGREGATE_RESULT_KEY}_threshold'] = self.threshold
         aggregated["per_tool_call_details"] = per_turn_results
         return aggregated