Simplified Tool Call Accuracy V1 #40710

Open · wants to merge 1 commit into main

@@ -0,0 +1,173 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
from typing import Any, Dict, List, Optional, Tuple, Union

from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase

from ._tool_call_predictor import predict_tools


def _extract_tool_items(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    items = []
    for msg in messages:
        for c in msg.get("content", []):
            if c.get("type") == "tool_call":
                items.append({
                    "type": "tool_call",
                    "name": c.get("name"),
                    "arguments": c.get("arguments"),
                })
            elif c.get("type") == "tool_result":
                items.append({
                    "type": "tool_result",
                    "tool_result": c.get("tool_result"),
                })
    return items


def _compare_values(val1, val2) -> float:
    # Recursively compare dicts, lists, and primitives, returning a score in [0, 1].
    if isinstance(val1, dict) and isinstance(val2, dict):
        keys = set(val1) | set(val2)
        if not keys:
            return 1.0
        scores = []
        for k in keys:
            if k in val1 and k in val2:
                scores.append(_compare_values(val1[k], val2[k]))
            else:
                scores.append(0.0)
        return sum(scores) / len(scores)
    elif isinstance(val1, list) and isinstance(val2, list):
        if not val1 and not val2:
            return 1.0
        matches = []
        min_len = min(len(val1), len(val2))
        for i in range(min_len):
            matches.append(_compare_values(val1[i], val2[i]))
        # Penalize missing or extra values.
        matches += [0.0] * abs(len(val1) - len(val2))
        return sum(matches) / max(len(val1), len(val2), 1)
    else:
        # Compare scalars as case-insensitive strings (fuzzy matching could be added here).
        if str(val1).lower() == str(val2).lower():
            return 1.0
        return 0.0
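
# Worked example (illustrative, not part of the change):
#   _compare_values({"lat": "40.71", "lon": "-74.00"}, {"lat": "40.71"})
#   compares the union of keys {"lat", "lon"}: "lat" matches (1.0) and "lon" is
#   missing from the second dict (0.0), so the averaged score is 0.5.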


def _match_tool_call(expected: Dict, predicted: Dict) -> float:
    if expected["type"] != predicted["type"]:
        return 0.0
    if expected["type"] == "tool_call":
        name_match = expected.get("name") == predicted.get("name")
        args_expected = expected.get("arguments", {})
        args_pred = predicted.get("arguments", {})
        if name_match:
            if args_expected == args_pred:
                return 1.0
            # Partial credit for overlapping arguments.
            arg_score = _compare_values(args_expected, args_pred)
            return 0.5 + 0.5 * arg_score if arg_score > 0 else 0.5
        else:
            return 0.0
    elif expected["type"] == "tool_result":
        res_expected = expected.get("tool_result", {})
        res_pred = predicted.get("tool_result", {})
        score = _compare_values(res_expected, res_pred)
        if score == 1.0:
            return 1.0
        elif score > 0:
            return 0.5 + 0.5 * score
        else:
            return 0.5 if res_pred else 0.0
    return 0.0


def _evaluate_tool_accuracy(expected: List[Dict], predicted: List[Dict]) -> Tuple[float, List[str]]:
    reasons = []
    matched_pred_indices = set()
    per_item_scores = []

    for exp in expected:
        max_score = 0.0
        max_j = None
        for j, pred in enumerate(predicted):
            if j in matched_pred_indices:
                continue
            score = _match_tool_call(exp, pred)
            if score > max_score:
                max_score = score
                max_j = j
        per_item_scores.append(max_score)
        if max_j is not None:
            # Consume the matched prediction so it cannot be matched twice.
            matched_pred_indices.add(max_j)
        if max_score == 1.0:
            continue
        elif max_score >= 0.75:
            reasons.append(f"Mostly correct for {exp['type']} (minor data mismatch).")
        elif max_score >= 0.5:
            reasons.append(f"Partial match for {exp['type']} (key fields mismatch or missing).")
        else:
            reasons.append(f"Missing or incorrect {exp['type']}.")
    accuracy = sum(per_item_scores) / max(len(expected), 1)
    return accuracy, reasons
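
# Worked example (illustrative, not part of the change): with a single expected item
#   {"type": "tool_call", "name": "azure_maps_search", "arguments": {"address": "New York City"}}
# and a predicted call that has the right name but a different address, _match_tool_call
# returns 0.5 (name match, no argument overlap), so _evaluate_tool_accuracy reports an
# accuracy of 0.5 with a "Partial match for tool_call ..." reason.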


class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
    _PROMPTY_FILE = None
    _RESULT_KEY = "tool_call_accuracy"
    _DEFAULT_THRESHOLD = 0.8

    def __init__(self, model_config, *, threshold: float = _DEFAULT_THRESHOLD):
        self.threshold = threshold
        # Keep a reference to the model config for ground-truth prediction in _do_eval.
        self._model_config = model_config
        super().__init__(
            model_config=model_config,
            prompty_file=None,
            result_key=self._RESULT_KEY
        )

    def __call__(
        self,
        query: List[Dict[str, Any]],
        response: List[Dict[str, Any]],
        tool_definitions: List[Dict[str, Any]],
        ground_truth: Optional[List[Dict[str, Any]]] = None,
        **kwargs
    ) -> Dict[str, Any]:
        eval_input = {
            "query": query,
            "response": response,
            "tool_definitions": tool_definitions,
            "ground_truth": ground_truth
        }
        return super().__call__(eval_input, **kwargs)

    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
        query = eval_input.get("query")
        response = eval_input.get("response")
        tool_definitions = eval_input.get("tool_definitions")
        ground_truth = eval_input.get("ground_truth")

        if ground_truth is None:
            # No ground truth supplied: predict the expected tool calls from the query
            # and tool definitions, then map the predictor output (tool_name/parameters)
            # to the item format used by _evaluate_tool_accuracy.
            predicted = predict_tools(self._model_config, query, tool_definitions)
            ground_truth = [
                {"type": "tool_call", "name": tc.get("tool_name"), "arguments": tc.get("parameters", {})}
                for tc in predicted.get("tool_calls", [])
            ]

        gt_items = ground_truth
        pred_items = _extract_tool_items(response)

        accuracy, reasons = _evaluate_tool_accuracy(gt_items, pred_items)
        status = "pass" if accuracy >= self.threshold else "fail"

        if not reasons:
            reason_str = "All tool calls and results matched the ground truth."
        elif accuracy == 0.0:
            reason_str = "No correct tool calls or results detected."
        else:
            reason_str = "; ".join(reasons)

        result = {
            "tool_call_accuracy": status,
            "tool_call_accuracy_score": round(accuracy, 3),
            "tool_call_threshold": self.threshold,
            "tool_call_accuracy_reason": reason_str
        }
        return result
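
For context, here is a minimal, illustrative invocation sketch based on the `__call__` signature above; it is not part of this diff. The endpoint and deployment values are placeholders, and the message and tool-definition shapes are assumptions inferred from `_extract_tool_items` and the prompty examples further down.

```python
from azure.ai.evaluation import AzureOpenAIModelConfiguration

# Placeholder configuration values; substitute your own resource details.
model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com",
    azure_deployment="<your-deployment>",
    api_key="<your-api-key>",
)

evaluator = ToolCallAccuracyEvaluator(model_config, threshold=0.8)

result = evaluator(
    query=[{"role": "user", "content": [{"type": "text", "text": "What is the weather like in New York City?"}]}],
    response=[{
        "role": "assistant",
        "content": [
            {"type": "tool_call", "name": "azure_maps_search", "arguments": {"address": "New York City"}},
        ],
    }],
    tool_definitions=[{"name": "azure_maps_search", "parameters": {"type": "object", "properties": {"address": {"type": "string"}}}}],
    # Supplying ground_truth skips the LLM-based prediction in _do_eval.
    ground_truth=[{"type": "tool_call", "name": "azure_maps_search", "arguments": {"address": "New York City"}}],
)
print(result)  # e.g. {"tool_call_accuracy": "pass", "tool_call_accuracy_score": 1.0, ...}
```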
@@ -0,0 +1,9 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

from ._tool_call_predictor import predict_tools

__all__ = [
    "predict_tools",
]
@@ -0,0 +1,71 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

import os
from typing import Union

from promptflow.core import AsyncPrompty

from azure.ai.evaluation._legacy._adapters.utils import async_run_allowing_running_loop

from ...._common.utils import construct_prompty_model_config, validate_model_config

try:
    from ..._user_agent import USER_AGENT
except ImportError:
    USER_AGENT = "None"


def predict_tools(model_config, query: Union[str, list], tool_definition: list):
    """Predict the expected tool calls (ground truth) for the given query and tool definitions.

    :param model_config: Configuration of the model used to predict the tool calls.
    :type model_config: dict
    :param query: The input query or a list of queries.
    :type query: Union[str, list]
    :param tool_definition: The tool definitions to use for generating ground truth.
    :type tool_definition: list
    :return: The generated ground truth.
    :rtype: dict
    """
    _PROMPTY_FILE = "tool_call_predictor.prompty"
    current_dir = os.path.dirname(__file__)
    prompty_path = os.path.join(current_dir, _PROMPTY_FILE)
    _LLM_CALL_TIMEOUT = 600
    _DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"

    user_agent = f"{USER_AGENT} (type=evaluator subtype=ground_truth_generator)"
    prompty_model_config = construct_prompty_model_config(
        validate_model_config(model_config),
        _DEFAULT_OPEN_API_VERSION,
        user_agent,
    )

    flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config)
    eval_input = {
        "query": query,
        "tool_definition": tool_definition,
    }
    llm_output = async_run_allowing_running_loop(flow, **{"timeout": _LLM_CALL_TIMEOUT, **eval_input})
    if isinstance(llm_output, dict):
        return llm_output
    # Fall back to an empty prediction if the LLM output is not a dict.
    return {"tool_calls": []}
@@ -0,0 +1,110 @@
---
name: Ground Truth Generator
description: Generates ground truth for Tool Call Accuracy
model:
  api: chat
  parameters:
    temperature: 0.0
    max_tokens: 800
    top_p: 1.0
    presence_penalty: 0
    frequency_penalty: 0
    response_format:
      type: json_object
inputs:
  query:
    type: List
  tool_definition:
    type: Dict
---
system:
# Instruction

You are an expert in analyzing user queries and determining the appropriate tool calls from a set of available tools listed in `tool_definitions`. Your goal is to identify the most relevant tool(s) to resolve the user's request. In some cases, multiple tools may need to be used in sequence, where one tool’s output serves as input to another.

**Instructions:**

1. **Think step by step.**
2. Carefully analyze the user query to extract all available parameters.
3. For each tool in `tool_definitions`, examine:
   - What the tool does.
   - What parameters it requires.
   - Whether each required parameter is:
     - Directly available from the user query, or
     - Obtainable by first calling another of the tools available in tool_definitions.
4. Only select a tool if:
   - It directly resolves the user’s request, or
   - It is a necessary prerequisite to another tool that does.
5. Do NOT select a tool if its required parameters are neither directly available in the query nor obtainable via another available tool call. DO NOT ASSUME any external knowledge or availability of any other tool beyond the provided query and tool_definitions.

For every tool you choose to use, provide:
- A clear reasoning for why the tool is relevant to the query.
- An analysis of each parameter the tool requires, and whether it is available directly from the query or needs to be obtained from another available tool.
- An explanation of how the tool contributes to resolving the overall user query.

Your goal is to build a minimal and effective chain of tool calls to fulfill the user's request completely and accurately.

# Data:
Query: {{query}}
Tool Definitions: {{tool_definition}}

# Examples:

**Example 1:**
**query:** Where exactly is 221B Baker Street, London?
**tool_definitions:** '[{"name":"azure_maps_weather_hourly_forecast","description":"No description","parameters":{"type":"object","properties":{"lat":{"type":"string","description":"No description"},"lon":{"type":"string","description":"No description"},"duration":{"type":"string","description":"No description"}}}},{"name":"azure_maps_geolocation","description":"No description","parameters":{"type":"object","properties":{"ip_address":{"type":"string","description":"No description"}}}},{"name":"azure_maps_weather_current_conditions","description":"No description","parameters":{"type":"object","properties":{"lat":{"type":"string","description":"No description"},"lon":{"type":"string","description":"No description"}}}},{"name":"azure_maps_search","description":"No description","parameters":{"type":"object","properties":{"address":{"type":"string","description":"No description"}}}}]'

**Expected output**
{
  "thought_chain": "Let's think step by step: The user wants to know the exact location of '221B Baker Street, London'. This is an address-based query. To resolve this, we need a tool that can convert the textual address into geographical coordinates or a precise map location. Among the available tools, only 'azure_maps_search' accepts an address as input. The other tools require latitude and longitude, or IP address, which are not available directly in the user query. Therefore, only 'azure_maps_search' is relevant here.",
  "tool_calls": [
    {
      "tool_name": "azure_maps_search",
      "parameters": {
        "address": "221B Baker Street, London"
      },
      "reasoning": "'azure_maps_search' is the appropriate tool because it accepts a textual address as input and returns location information. The user has provided a complete address in their query, so the required parameter 'address' is directly available."
    }
  ]
}

**Example 2:**
**query:** What is the weather like in New York City?
**tool_definitions:** '[{"name":"azure_maps_weather_hourly_forecast","description":"No description","parameters":{"type":"object","properties":{"lat":{"type":"string","description":"No description"},"lon":{"type":"string","description":"No description"},"duration":{"type":"string","description":"No description"}}}},{"name":"azure_maps_geolocation","description":"No description","parameters":{"type":"object","properties":{"ip_address":{"type":"string","description":"No description"}}}},{"name":"azure_maps_weather_current_conditions","description":"No description","parameters":{"type":"object","properties":{"lat":{"type":"string","description":"No description"},"lon":{"type":"string","description":"No description"}}}},{"name":"azure_maps_search","description":"No description","parameters":{"type":"object","properties":{"address":{"type":"string","description":"No description"}}}}]'

**Expected output**
{
  "thought_chain": "Let's think step by step: The user is asking about the current weather in New York City. To answer this, we need the current weather conditions for that location. The tool 'azure_maps_weather_current_conditions' provides current weather but requires latitude and longitude as input. The user query provides a city name, not coordinates. To get the coordinates, we can use the 'azure_maps_search' tool, which accepts an address (in this case, 'New York City') and returns location data including lat and lon. So, we must first call 'azure_maps_search' with the city name, then use the returned lat/lon in 'azure_maps_weather_current_conditions'.",
  "tool_calls": [
    {
      "tool_name": "azure_maps_search",
      "parameters": {
        "address": "New York City"
      },
      "reasoning": "'azure_maps_search' is necessary because it can convert the address 'New York City' into geographic coordinates (latitude and longitude), which are required for retrieving weather data."
    },
    {
      "tool_name": "azure_maps_weather_current_conditions",
      "parameters": {
        "lat": "<to be filled from azure_maps_search result>",
        "lon": "<to be filled from azure_maps_search result>"
      },
      "reasoning": "'azure_maps_weather_current_conditions' provides the current weather information, which directly answers the user's query. It requires 'lat' and 'lon', which we will obtain from the previous 'azure_maps_search' call."
    }
  ]
}




# Task
## Please generate a list of tool calls that are relevant to the user QUERY given the set of TOOL_DEFINITIONS. Follow the INSTRUCTIONS strictly. DO NOT ASSUME any external knowledge or availability of any other tool beyond the provided query and tool_definitions. If multiple tool calls are needed, list them in the order they should be called. Your output should consist only of a JSON object, as provided in the examples, that has the following keys:
- **thought_chain**: To improve the reasoning process, think step by step and include a step-by-step explanation of your thought process as you analyze the data based on the definitions. Keep it brief and start your thought_chain with "Let's think step by step:".
- **tool_calls**: A list of tool calls that are relevant to the user query. Each tool call should include the following information:
- **tool_name**: The name of the tool.
- **parameters**: The parameters used in the tool call, including their values.
- **reasoning**: A brief explanation of why this tool call, out of the available tools, is relevant to the user query.

# Output