Simplified Tool Call Accuracy V1 #40710

Open · wants to merge 1 commit into main

@@ -0,0 +1,173 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
from typing import Any, Dict, List, Optional, Tuple, Union

from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase

from ._tool_call_predictor import predict_tools


def _extract_tool_items(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    items = []
    for msg in messages:
        for c in msg.get("content", []):
            if c.get("type") == "tool_call":
                items.append({
                    "type": "tool_call",
                    "name": c.get("name"),
                    "arguments": c.get("arguments"),
                })
            elif c.get("type") == "tool_result":
                items.append({
                    "type": "tool_result",
                    "tool_result": c.get("tool_result"),
                })
    return items


def _compare_values(val1, val2) -> float:
    # Recursively compare dicts, lists, and primitives, returning a score in [0, 1].
    if isinstance(val1, dict) and isinstance(val2, dict):
        keys = set(val1) | set(val2)
        if not keys:
            return 1.0
        scores = []
        for k in keys:
            if k in val1 and k in val2:
                scores.append(_compare_values(val1[k], val2[k]))
            else:
                scores.append(0.0)
        return sum(scores) / len(scores)
    elif isinstance(val1, list) and isinstance(val2, list):
        if not val1 and not val2:
            return 1.0
        matches = []
        min_len = min(len(val1), len(val2))
        for i in range(min_len):
            matches.append(_compare_values(val1[i], val2[i]))
        # Penalize missing or extra values.
        matches += [0.0] * abs(len(val1) - len(val2))
        return sum(matches) / max(len(val1), len(val2), 1)
    else:
        # Compare scalars as case-insensitive strings (fuzzy matching could be added here).
        if str(val1).lower() == str(val2).lower():
            return 1.0
        return 0.0
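
# Worked example (illustrative, not part of the change):
#   _compare_values({"lat": "40.71", "lon": "-74.00"}, {"lat": "40.71"})
#   compares the union of keys {"lat", "lon"}: "lat" matches (1.0) and "lon" is
#   missing from the second dict (0.0), so the averaged score is 0.5.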


def _match_tool_call(expected: Dict, predicted: Dict) -> float:
    if expected["type"] != predicted["type"]:
        return 0.0
    if expected["type"] == "tool_call":
        name_match = expected.get("name") == predicted.get("name")
        args_expected = expected.get("arguments", {})
        args_pred = predicted.get("arguments", {})
        if name_match:
            if args_expected == args_pred:
                return 1.0
            # Partial credit for overlapping arguments.
            arg_score = _compare_values(args_expected, args_pred)
            return 0.5 + 0.5 * arg_score if arg_score > 0 else 0.5
        else:
            return 0.0
    elif expected["type"] == "tool_result":
        res_expected = expected.get("tool_result", {})
        res_pred = predicted.get("tool_result", {})
        score = _compare_values(res_expected, res_pred)
        if score == 1.0:
            return 1.0
        elif score > 0:
            return 0.5 + 0.5 * score
        else:
            return 0.5 if res_pred else 0.0
    return 0.0


def _evaluate_tool_accuracy(expected: List[Dict], predicted: List[Dict]) -> Tuple[float, List[str]]:
    reasons = []
    matched_pred_indices = set()
    per_item_scores = []

    for exp in expected:
        max_score = 0.0
        max_j = None
        for j, pred in enumerate(predicted):
            if j in matched_pred_indices:
                continue
            score = _match_tool_call(exp, pred)
            if score > max_score:
                max_score = score
                max_j = j
        per_item_scores.append(max_score)
        if max_j is not None:
            # Consume the matched prediction so it cannot be matched twice.
            matched_pred_indices.add(max_j)
        if max_score == 1.0:
            continue
        elif max_score >= 0.75:
            reasons.append(f"Mostly correct for {exp['type']} (minor data mismatch).")
        elif max_score >= 0.5:
            reasons.append(f"Partial match for {exp['type']} (key fields mismatch or missing).")
        else:
            reasons.append(f"Missing or incorrect {exp['type']}.")
    accuracy = sum(per_item_scores) / max(len(expected), 1)
    return accuracy, reasons
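
# Worked example (illustrative, not part of the change): with a single expected item
#   {"type": "tool_call", "name": "azure_maps_search", "arguments": {"address": "New York City"}}
# and a predicted call that has the right name but a different address, _match_tool_call
# returns 0.5 (name match, no argument overlap), so _evaluate_tool_accuracy reports an
# accuracy of 0.5 with a "Partial match for tool_call ..." reason.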


class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
    _PROMPTY_FILE = None
    _RESULT_KEY = "tool_call_accuracy"
    _DEFAULT_THRESHOLD = 0.8

    def __init__(self, model_config, *, threshold: float = _DEFAULT_THRESHOLD):
        self.threshold = threshold
        # Keep a reference to the model config for ground-truth prediction in _do_eval.
        self._model_config = model_config
        super().__init__(
            model_config=model_config,
            prompty_file=None,
            result_key=self._RESULT_KEY
        )

    def __call__(
        self,
        query: List[Dict[str, Any]],
        response: List[Dict[str, Any]],
        tool_definitions: List[Dict[str, Any]],
        ground_truth: Optional[List[Dict[str, Any]]] = None,
        **kwargs
    ) -> Dict[str, Any]:
        eval_input = {
            "query": query,
            "response": response,
            "tool_definitions": tool_definitions,
            "ground_truth": ground_truth
        }
        return super().__call__(eval_input, **kwargs)

    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
        query = eval_input.get("query")
        response = eval_input.get("response")
        tool_definitions = eval_input.get("tool_definitions")
        ground_truth = eval_input.get("ground_truth")

        if ground_truth is None:
            # No ground truth supplied: predict the expected tool calls from the query
            # and tool definitions, then map the predictor output (tool_name/parameters)
            # to the item format used by _evaluate_tool_accuracy.
            predicted = predict_tools(self._model_config, query, tool_definitions)
            ground_truth = [
                {"type": "tool_call", "name": tc.get("tool_name"), "arguments": tc.get("parameters", {})}
                for tc in predicted.get("tool_calls", [])
            ]

        gt_items = ground_truth
        pred_items = _extract_tool_items(response)

        accuracy, reasons = _evaluate_tool_accuracy(gt_items, pred_items)
        status = "pass" if accuracy >= self.threshold else "fail"

        if not reasons:
            reason_str = "All tool calls and results matched the ground truth."
        elif accuracy == 0.0:
            reason_str = "No correct tool calls or results detected."
        else:
            reason_str = "; ".join(reasons)

        result = {
            "tool_call_accuracy": status,
            "tool_call_accuracy_score": round(accuracy, 3),
            "tool_call_threshold": self.threshold,
            "tool_call_accuracy_reason": reason_str
        }
        return result
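
For context, here is a minimal, illustrative invocation sketch based on the `__call__` signature above; it is not part of this diff. The endpoint and deployment values are placeholders, and the message and tool-definition shapes are assumptions inferred from `_extract_tool_items` and the prompty examples further down.

```python
from azure.ai.evaluation import AzureOpenAIModelConfiguration

# Placeholder configuration values; substitute your own resource details.
model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com",
    azure_deployment="<your-deployment>",
    api_key="<your-api-key>",
)

evaluator = ToolCallAccuracyEvaluator(model_config, threshold=0.8)

result = evaluator(
    query=[{"role": "user", "content": [{"type": "text", "text": "What is the weather like in New York City?"}]}],
    response=[{
        "role": "assistant",
        "content": [
            {"type": "tool_call", "name": "azure_maps_search", "arguments": {"address": "New York City"}},
        ],
    }],
    tool_definitions=[{"name": "azure_maps_search", "parameters": {"type": "object", "properties": {"address": {"type": "string"}}}}],
    # Supplying ground_truth skips the LLM-based prediction in _do_eval.
    ground_truth=[{"type": "tool_call", "name": "azure_maps_search", "arguments": {"address": "New York City"}}],
)
print(result)  # e.g. {"tool_call_accuracy": "pass", "tool_call_accuracy_score": 1.0, ...}
```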
@@ -0,0 +1,9 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

from ._tool_call_predictor import predict_tools

__all__ = [
    "predict_tools",
]
@@ -0,0 +1,71 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

import os
from typing import Union

from promptflow.core import AsyncPrompty

from azure.ai.evaluation._legacy._adapters.utils import async_run_allowing_running_loop

from ...._common.utils import construct_prompty_model_config, validate_model_config

try:
    from ..._user_agent import USER_AGENT
except ImportError:
    USER_AGENT = "None"


def predict_tools(model_config, query: Union[str, list], tool_definition: list):
    """Predict the expected tool calls (ground truth) for the given query and tool definitions.

    :param model_config: Configuration of the model used to predict the tool calls.
    :type model_config: dict
    :param query: The input query or a list of queries.
    :type query: Union[str, list]
    :param tool_definition: The tool definitions to use for generating ground truth.
    :type tool_definition: list
    :return: The generated ground truth.
    :rtype: dict
    """
    _PROMPTY_FILE = "tool_call_predictor.prompty"
    current_dir = os.path.dirname(__file__)
    prompty_path = os.path.join(current_dir, _PROMPTY_FILE)
    _LLM_CALL_TIMEOUT = 600
    _DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"

    user_agent = f"{USER_AGENT} (type=evaluator subtype=ground_truth_generator)"
    prompty_model_config = construct_prompty_model_config(
        validate_model_config(model_config),
        _DEFAULT_OPEN_API_VERSION,
        user_agent,
    )

    flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config)
    eval_input = {
        "query": query,
        "tool_definition": tool_definition,
    }
    llm_output = async_run_allowing_running_loop(flow, **{"timeout": _LLM_CALL_TIMEOUT, **eval_input})
    if isinstance(llm_output, dict):
        return llm_output
    # Fall back to an empty prediction if the LLM output is not a dict.
    return {"tool_calls": []}
@@ -0,0 +1,110 @@
---
name: Ground Truth Generator
description: Generates ground truth for Tool Call Accuracy
model:
  api: chat
  parameters:
    temperature: 0.0
    max_tokens: 800
    top_p: 1.0
    presence_penalty: 0
    frequency_penalty: 0
    response_format:
      type: json_object
inputs:
  query:
    type: List
  tool_definition:
    type: Dict
---
system:
# Instruction

You are an expert in analyzing user queries and determining the appropriate tool calls from a set of available tools listed in `tool_definitions`. Your goal is to identify the most relevant tool(s) to resolve the user's request. In some cases, multiple tools may need to be used in sequence, where one tool’s output serves as input to another.

**Instructions:**

1. **Think step by step.**
2. Carefully analyze the user query to extract all available parameters.
3. For each tool in `tool_definitions`, examine:
   - What the tool does.
   - What parameters it requires.
   - Whether each required parameter is:
     - Directly available from the user query, or
     - Obtainable by first calling another of the tools available in tool_definitions.
4. Only select a tool if:
   - It directly resolves the user’s request, or
   - It is a necessary prerequisite to another tool that does.
5. Do NOT select a tool if its required parameters are neither directly available in the query nor obtainable via another available tool call. DO NOT ASSUME any external knowledge or availability of any other tool beyond the provided query and tool_definitions.

For every tool you choose to use, provide:
- A clear reasoning for why the tool is relevant to the query.
- An analysis of each parameter the tool requires, and whether it is available directly from the query or needs to be obtained from another available tool.
- An explanation of how the tool contributes to resolving the overall user query.

Your goal is to build a minimal and effective chain of tool calls to fulfill the user's request completely and accurately.

# Data:
Query: {{query}}
Tool Definitions: {{tool_definition}}

# Examples:

**Example 1:**
**query:** Where exactly is 221B Baker Street, London?
**tool_definitions:** '[{"name":"azure_maps_weather_hourly_forecast","description":"No description","parameters":{"type":"object","properties":{"lat":{"type":"string","description":"No description"},"lon":{"type":"string","description":"No description"},"duration":{"type":"string","description":"No description"}}}},{"name":"azure_maps_geolocation","description":"No description","parameters":{"type":"object","properties":{"ip_address":{"type":"string","description":"No description"}}}},{"name":"azure_maps_weather_current_conditions","description":"No description","parameters":{"type":"object","properties":{"lat":{"type":"string","description":"No description"},"lon":{"type":"string","description":"No description"}}}},{"name":"azure_maps_search","description":"No description","parameters":{"type":"object","properties":{"address":{"type":"string","description":"No description"}}}}]'

**Expected output**
{
  "thought_chain": "Let's think step by step: The user wants to know the exact location of '221B Baker Street, London'. This is an address-based query. To resolve this, we need a tool that can convert the textual address into geographical coordinates or a precise map location. Among the available tools, only 'azure_maps_search' accepts an address as input. The other tools require latitude and longitude, or IP address, which are not available directly in the user query. Therefore, only 'azure_maps_search' is relevant here.",
  "tool_calls": [
    {
      "tool_name": "azure_maps_search",
      "parameters": {
        "address": "221B Baker Street, London"
      },
      "reasoning": "'azure_maps_search' is the appropriate tool because it accepts a textual address as input and returns location information. The user has provided a complete address in their query, so the required parameter 'address' is directly available."
    }
  ]
}

**Example 2:**
**query:** What is the weather like in New York City?
**tool_definitions:** '[{"name":"azure_maps_weather_hourly_forecast","description":"No description","parameters":{"type":"object","properties":{"lat":{"type":"string","description":"No description"},"lon":{"type":"string","description":"No description"},"duration":{"type":"string","description":"No description"}}}},{"name":"azure_maps_geolocation","description":"No description","parameters":{"type":"object","properties":{"ip_address":{"type":"string","description":"No description"}}}},{"name":"azure_maps_weather_current_conditions","description":"No description","parameters":{"type":"object","properties":{"lat":{"type":"string","description":"No description"},"lon":{"type":"string","description":"No description"}}}},{"name":"azure_maps_search","description":"No description","parameters":{"type":"object","properties":{"address":{"type":"string","description":"No description"}}}}]'

**Expected output**
{
  "thought_chain": "Let's think step by step: The user is asking about the current weather in New York City. To answer this, we need the current weather conditions for that location. The tool 'azure_maps_weather_current_conditions' provides current weather but requires latitude and longitude as input. The user query provides a city name, not coordinates. To get the coordinates, we can use the 'azure_maps_search' tool, which accepts an address (in this case, 'New York City') and returns location data including lat and lon. So, we must first call 'azure_maps_search' with the city name, then use the returned lat/lon in 'azure_maps_weather_current_conditions'.",
  "tool_calls": [
    {
      "tool_name": "azure_maps_search",
      "parameters": {
        "address": "New York City"
      },
      "reasoning": "'azure_maps_search' is necessary because it can convert the address 'New York City' into geographic coordinates (latitude and longitude), which are required for retrieving weather data."
    },
    {
      "tool_name": "azure_maps_weather_current_conditions",
      "parameters": {
        "lat": "<to be filled from azure_maps_search result>",
        "lon": "<to be filled from azure_maps_search result>"
      },
      "reasoning": "'azure_maps_weather_current_conditions' provides the current weather information, which directly answers the user's query. It requires 'lat' and 'lon', which we will obtain from the previous 'azure_maps_search' call."
    }
  ]
}




# Task
## Please generate a list of tool calls that are relevant to the user QUERY given the set of TOOL_DEFINITIONS. Follow the INSTRUCTIONS strictly. DO NOT ASSUME any external knowledge or availability of any other tool beyond the provided query and tool_definitions. If multiple tool calls are needed, list them in the order they should be called. Your output should consist only of a JSON object, as provided in the examples, that has the following keys:
- **thought_chain**: To improve the reasoning process, think step by step and include a step-by-step explanation of your thought process as you analyze the data based on the definitions. Keep it brief and start your thought_chain with "Let's think step by step:".
- **tool_calls**: A list of tool calls that are relevant to the user query. Each tool call should include the following information:
- **tool_name**: The name of the tool.
- **parameters**: The parameters used in the tool call, including their values.
- **reasoning**: A brief explanation of why this tool call, out of the available tools, is relevant to the user query.

# Output