RobotecAI
diff --git a/‎.gitignore
+2 b/‎.gitignore
+2
diff --git a/‎config.toml
+1 b/‎config.toml
+1
diff --git a/‎docs/tracing.md
+1 b/‎docs/tracing.md
+1
diff --git a/‎poetry.lock
+203-157 b/‎poetry.lock
+203-157
diff --git a/‎src/rai_bench/README.md
+27 b/‎src/rai_bench/README.md
+27
diff --git a/‎src/rai_bench/pyproject.toml
+2-1 b/‎src/rai_bench/pyproject.toml
+2-1
diff --git a/‎src/rai_bench/rai_bench/examples/tool_calling_agent_bench_tasks.py
+102 b/‎src/rai_bench/rai_bench/examples/tool_calling_agent_bench_tasks.py
+102
diff --git a/‎src/rai_bench/rai_bench/examples/tool_calling_agent_test_bench.py
+74 b/‎src/rai_bench/rai_bench/examples/tool_calling_agent_test_bench.py
+74
@@ -174,3 +174,5 @@ src/examples/*-demo
 artifact_database.pkl
 
 imgui.ini
+
+src/rai_bench/rai_bench/experiments
@@ -30,6 +30,7 @@ host = "http://localhost:3000"
 
 [tracing.langsmith]
 use_langsmith = false
+host = "https://api.smith.langchain.com"
 
 [asr]
 recording_device_name = "default"
 
@@ -31,6 +31,7 @@ To enable LangSmith tracing:
 
 1. Set `use_langsmith = true` in the `config.toml` file.
 2. Set the `LANGCHAIN_API_KEY` environment variable with your LangSmith API key.
+3. Optionally, you can specify a custom LangSmith host by modifying the `host` field under `[tracing.langsmith]`.
 
 ## Usage
 
 
@@ -102,3 +102,30 @@ When creating new task or changing existing ones, make sure to add unit tests fo
 This applies also when you are adding or changing the helper methods in `Task` or `ManipulationTask`.
 
 The number of scenarios can be easily extended without writing new tasks, by increasing the number of variants of the same task and adding more simulation configs, but it won't improve the variety of scenarios as much as creating new tasks.
+
+### Tool Calling Agent Benchmark
+
+The Tool Calling Agent Benchmark is the benchmark for LangChain tool calling agents. It includes a set of tasks and a benchmark that evaluates the performance of the agent on those tasks by verifying the correctness of the tool calls requested by the agent. The benchmark is integrated with LangSmith and Langfuse tracing backends to easily track the performance of the agents.
+
+#### Frame Components
+
+- [Tool Calling Agent Benchmark](rai_bench/tool_calling_agent_bench/agent_bench.py) - Benchmark for LangChain tool calling agents
+- [Tasks Interfaces](rai_bench/tool_calling_agent_bench/agent_tasks_interfaces.py) - Interfaces for tool calling agent tasks
+- [Scores tracing](rai_bench/tool_calling_agent_bench/scores_tracing.py) - Component handling sending scores to tracing backends
+
+#### Benchmark Example with ROS2 Tools
+
+[tool_calling_agent_test_bench.py](rai_bench/examples/tool_calling_agent_test_bench.py) - Script that runs the benchmark on tasks based on ROS2 tool usage.
+
+To set up tracing backends, please follow the instructions in the [tracing.md](../../docs/tracing.md) document.
+
+To run the benchmark:
+
+```bash
+cd rai
+source setup_shell.sh
+python src/rai_bench/rai_bench/examples/tool_calling_agent_test_bench.py
+```
+
+> [!NOTE]
+> The `simple_model` from [config.toml](../../config.toml) is currently set up in the example benchmark script. Change it to `complex_model` in the script if needed.
@@ -2,14 +2,15 @@
 name = "rai-bench"
 version = "0.1.0"
 description = "Package for running and creating benchmarks."
-authors = ["jmatejcz <[email protected]>"]
+authors = ["jmatejcz <[email protected]>", "Magdalena Kotynia <[email protected]>"]
 readme = "README.md"
 
 packages = [
     { include = "rai_bench", from = "." },
 ]
 [tool.poetry.dependencies]
 python = "^3.10"
+inflect = "7.5.0"
 
 
 [build-system]
 
@@ -0,0 +1,102 @@
+# Copyright (C) 2025 Robotec.AI
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#         http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Sequence
+
+from rai_bench.tool_calling_agent_bench.agent_tasks_interfaces import (
+    ToolCallingAgentTask,
+)
+from rai_bench.tool_calling_agent_bench.ros2_agent_tasks import (
+    GetAllROS2RGBCamerasTask,
+    GetObjectPositionsTask,
+    GetROS2DepthCameraTask,
+    GetROS2MessageTask,
+    GetROS2RGBCameraTask,
+    GetROS2TopicsTask,
+    GetROS2TopicsTask2,
+    GrabExistingObjectTask,
+    GrabNotExistingObjectTask,
+    MoveExistingObjectFrontTask,
+    MoveExistingObjectLeftTask,
+    MoveToPointTask,
+    SwapObjectsTask,
+)
+
+tasks: Sequence[ToolCallingAgentTask] = [
+    GetROS2RGBCameraTask(),
+    GetROS2TopicsTask(),
+    GetROS2DepthCameraTask(),
+    GetAllROS2RGBCamerasTask(),
+    GetROS2TopicsTask2(),
+    GetROS2MessageTask(),
+    MoveToPointTask(args={"x": 1.0, "y": 2.0, "z": 3.0, "task": "grab"}),
+    MoveToPointTask(args={"x": 1.2, "y": 2.3, "z": 3.4, "task": "drop"}),
+    GetObjectPositionsTask(
+        objects={
+            "carrot": [{"x": 1.0, "y": 2.0, "z": 3.0}],
+            "apple": [{"x": 4.0, "y": 5.0, "z": 6.0}],
+            "banana": [
+                {"x": 7.0, "y": 8.0, "z": 9.0},
+                {"x": 10.0, "y": 11.0, "z": 12.0},
+            ],
+        },
+    ),
+    GrabExistingObjectTask(
+        object_to_grab="banana",
+        objects={
+            "banana": [{"x": 7.0, "y": 8.0, "z": 9.0}],
+            "apple": [
+                {"x": 4.0, "y": 5.0, "z": 6.0},
+                {"x": 10.0, "y": 11.0, "z": 12.0},
+            ],
+        },
+    ),
+    GrabNotExistingObjectTask(
+        object_to_grab="apple",
+        objects={
+            "banana": [{"x": 7.0, "y": 8.0, "z": 9.0}],
+            "cube": [
+                {"x": 4.0, "y": 5.0, "z": 6.0},
+                {"x": 10.0, "y": 11.0, "z": 12.0},
+            ],
+        },
+    ),
+    MoveExistingObjectLeftTask(
+        object_to_grab="banana",
+        objects={
+            "banana": [{"x": 7.0, "y": 8.0, "z": 9.0}],
+            "apple": [
+                {"x": 4.0, "y": 5.0, "z": 6.0},
+                {"x": 10.0, "y": 11.0, "z": 12.0},
+            ],
+        },
+    ),
+    MoveExistingObjectFrontTask(
+        object_to_grab="banana",
+        objects={
+            "banana": [{"x": 7.0, "y": 8.0, "z": 9.0}],
+            "apple": [
+                {"x": 4.0, "y": 5.0, "z": 6.0},
+                {"x": 10.0, "y": 11.0, "z": 12.0},
+            ],
+        },
+    ),
+    SwapObjectsTask(
+        objects={
+            "banana": [{"x": 1.0, "y": 2.0, "z": 3.0}],
+            "apple": [{"x": 4.0, "y": 5.0, "z": 6.0}],
+        },
+        objects_to_swap=["banana", "apple"],
+    ),
+]
@@ -0,0 +1,74 @@
+# Copyright (C) 2025 Robotec.AI
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#         http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from datetime import datetime
+from pathlib import Path
+
+from rai.agents.conversational_agent import create_conversational_agent
+from rai.utils.model_initialization import (
+    get_llm_model,
+    get_llm_model_config_and_vendor,
+)
+
+from rai_bench.examples.tool_calling_agent_bench_tasks import tasks
+from rai_bench.tool_calling_agent_bench.agent_bench import ToolCallingAgentBenchmark
+
+if __name__ == "__main__":
+    current_test_name = Path(__file__).stem
+
+    now = datetime.now()
+    experiment_dir = (
+        Path("src/rai_bench/rai_bench/experiments")
+        / current_test_name
+        / now.strftime("%Y-%m-%d_%H-%M-%S")
+    )
+    experiment_dir.mkdir(parents=True, exist_ok=True)
+    log_filename = experiment_dir / "benchmark.log"
+    results_filename = experiment_dir / "results.csv"
+
+    file_handler = logging.FileHandler(log_filename)
+    file_handler.setLevel(logging.DEBUG)
+    formatter = logging.Formatter(
+        "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+    )
+    file_handler.setFormatter(formatter)
+
+    bench_logger = logging.getLogger("Benchmark logger")
+    bench_logger.setLevel(logging.INFO)
+    bench_logger.addHandler(file_handler)
+
+    agent_logger = logging.getLogger("Agent logger")
+    agent_logger.setLevel(logging.INFO)
+    agent_logger.addHandler(file_handler)
+
+    for task in tasks:
+        task.logger = bench_logger
+
+    benchmark = ToolCallingAgentBenchmark(
+        tasks=tasks, logger=bench_logger, results_filename=results_filename
+    )
+
+    model_type = "simple_model"
+    model_config = get_llm_model_config_and_vendor(model_type=model_type)[0]
+    model_name = getattr(model_config, model_type)
+
+    for task in tasks:
+        agent = create_conversational_agent(
+            llm=get_llm_model(model_type=model_type),
+            tools=task.expected_tools,
+            system_prompt=task.get_system_prompt(),
+            logger=agent_logger,
+        )
+        benchmark.run_next(agent=agent, model_name=model_name)