Skip to content

Commit 8954cd7

Browse files
feat: tool calling benchmark (#455)
Co-authored-by: Bartłomiej Boczek <[email protected]>
1 parent 9b47158 commit 8954cd7

14 files changed

+2701
-162
lines changed

.gitignore

+2
Original file line numberDiff line numberDiff line change
@@ -174,3 +174,5 @@ src/examples/*-demo
174174
artifact_database.pkl
175175

176176
imgui.ini
177+
178+
src/rai_bench/rai_bench/experiments

config.toml

+1
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ host = "http://localhost:3000"
3030

3131
[tracing.langsmith]
3232
use_langsmith = false
33+
host = "https://api.smith.langchain.com"
3334

3435
[asr]
3536
recording_device_name = "default"

docs/tracing.md

+1
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ To enable LangSmith tracing:
3131

3232
1. Set `use_langsmith = true` in the `config.toml` file.
3333
2. Set the `LANGCHAIN_API_KEY` environment variable with your LangSmith API key.
34+
3. Optionally, you can specify a custom LangSmith host by modifying the `host` field under `[tracing.langsmith]`.
3435

3536
## Usage
3637

poetry.lock

+203-157
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/rai_bench/README.md

+27
Original file line numberDiff line numberDiff line change
@@ -102,3 +102,30 @@ When creating new task or changing existing ones, make sure to add unit tests fo
102102
This applies also when you are adding or changing the helper methods in `Task` or `ManipulationTask`.
103103

104104
The number of scenarios can be easily extended without writing new tasks, by increasing the number of variants of the same task and adding more simulation configs, but this won't improve the variety of scenarios as much as creating new tasks.
105+
106+
### Tool Calling Agent Benchmark
107+
108+
The Tool Calling Agent Benchmark is the benchmark for LangChain tool calling agents. It includes a set of tasks and a benchmark that evaluates the performance of the agent on those tasks by verifying the correctness of the tool calls requested by the agent. The benchmark is integrated with LangSmith and Langfuse tracing backends to easily track the performance of the agents.
109+
110+
#### Frame Components
111+
112+
- [Tool Calling Agent Benchmark](rai_bench/tool_calling_agent_bench/agent_bench.py) - Benchmark for LangChain tool calling agents
113+
- [Tasks Interfaces](rai_bench/tool_calling_agent_bench/agent_tasks_interfaces.py) - Interfaces for tool calling agent tasks
114+
- [Scores tracing](rai_bench/tool_calling_agent_bench/scores_tracing.py) - Component handling sending scores to tracing backends
115+
116+
#### Benchmark Example with ROS2 Tools
117+
118+
[tool_calling_agent_test_bench.py](rai_bench/examples/tool_calling_agent_test_bench.py) - Script that runs the benchmark on tasks based on ROS2 tool usage.
119+
120+
To set up tracing backends, please follow the instructions in the [tracing.md](../../docs/tracing.md) document.
121+
122+
To run the benchmark:
123+
124+
```bash
125+
cd rai
126+
source setup_shell.sh
127+
python src/rai_bench/rai_bench/examples/tool_calling_agent_test_bench.py
128+
```
129+
130+
> [!NOTE]
131+
> The `simple_model` from [config.toml](../../config.toml) is currently set up in the example benchmark script. Change it to `complex_model` in the script if needed.

src/rai_bench/pyproject.toml

+2-1
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,15 @@
22
name = "rai-bench"
33
version = "0.1.0"
44
description = "Package for running and creating benchmarks."
5-
authors = ["jmatejcz <[email protected]>"]
5+
authors = ["jmatejcz <[email protected]>", "Magdalena Kotynia <[email protected]>"]
66
readme = "README.md"
77

88
packages = [
99
{ include = "rai_bench", from = "." },
1010
]
1111
[tool.poetry.dependencies]
1212
python = "^3.10"
13+
inflect = "7.5.0"
1314

1415

1516
[build-system]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
# Copyright (C) 2025 Robotec.AI
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from typing import Sequence
16+
17+
from rai_bench.tool_calling_agent_bench.agent_tasks_interfaces import (
18+
ToolCallingAgentTask,
19+
)
20+
from rai_bench.tool_calling_agent_bench.ros2_agent_tasks import (
21+
GetAllROS2RGBCamerasTask,
22+
GetObjectPositionsTask,
23+
GetROS2DepthCameraTask,
24+
GetROS2MessageTask,
25+
GetROS2RGBCameraTask,
26+
GetROS2TopicsTask,
27+
GetROS2TopicsTask2,
28+
GrabExistingObjectTask,
29+
GrabNotExistingObjectTask,
30+
MoveExistingObjectFrontTask,
31+
MoveExistingObjectLeftTask,
32+
MoveToPointTask,
33+
SwapObjectsTask,
34+
)
35+
36+
def _xyz(x: float, y: float, z: float) -> dict[str, float]:
    """Return a fresh position mapping with the keys ``x``, ``y`` and ``z``."""
    return {"x": x, "y": y, "z": z}


# Ordered collection of task instances making up the example benchmark.
# Object positions are passed per object name as a list of x/y/z mappings;
# every task receives its own freshly built mappings (nothing is shared).
tasks: Sequence[ToolCallingAgentTask] = [
    # Tasks constructed without arguments.
    GetROS2RGBCameraTask(),
    GetROS2TopicsTask(),
    GetROS2DepthCameraTask(),
    GetAllROS2RGBCamerasTask(),
    GetROS2TopicsTask2(),
    GetROS2MessageTask(),
    # Move-to-point tasks: a target point plus a "task" entry.
    MoveToPointTask(args={"x": 1.0, "y": 2.0, "z": 3.0, "task": "grab"}),
    MoveToPointTask(args={"x": 1.2, "y": 2.3, "z": 3.4, "task": "drop"}),
    # Tasks parameterized by a set of named objects and their positions.
    GetObjectPositionsTask(
        objects={
            "carrot": [_xyz(1.0, 2.0, 3.0)],
            "apple": [_xyz(4.0, 5.0, 6.0)],
            "banana": [_xyz(7.0, 8.0, 9.0), _xyz(10.0, 11.0, 12.0)],
        },
    ),
    GrabExistingObjectTask(
        object_to_grab="banana",
        objects={
            "banana": [_xyz(7.0, 8.0, 9.0)],
            "apple": [_xyz(4.0, 5.0, 6.0), _xyz(10.0, 11.0, 12.0)],
        },
    ),
    # "apple" is deliberately absent from the objects given to this task.
    GrabNotExistingObjectTask(
        object_to_grab="apple",
        objects={
            "banana": [_xyz(7.0, 8.0, 9.0)],
            "cube": [_xyz(4.0, 5.0, 6.0), _xyz(10.0, 11.0, 12.0)],
        },
    ),
    MoveExistingObjectLeftTask(
        object_to_grab="banana",
        objects={
            "banana": [_xyz(7.0, 8.0, 9.0)],
            "apple": [_xyz(4.0, 5.0, 6.0), _xyz(10.0, 11.0, 12.0)],
        },
    ),
    MoveExistingObjectFrontTask(
        object_to_grab="banana",
        objects={
            "banana": [_xyz(7.0, 8.0, 9.0)],
            "apple": [_xyz(4.0, 5.0, 6.0), _xyz(10.0, 11.0, 12.0)],
        },
    ),
    SwapObjectsTask(
        objects={
            "banana": [_xyz(1.0, 2.0, 3.0)],
            "apple": [_xyz(4.0, 5.0, 6.0)],
        },
        objects_to_swap=["banana", "apple"],
    ),
]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
# Copyright (C) 2025 Robotec.AI
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import logging
16+
from datetime import datetime
17+
from pathlib import Path
18+
19+
from rai.agents.conversational_agent import create_conversational_agent
20+
from rai.utils.model_initialization import (
21+
get_llm_model,
22+
get_llm_model_config_and_vendor,
23+
)
24+
25+
from rai_bench.examples.tool_calling_agent_bench_tasks import tasks
26+
from rai_bench.tool_calling_agent_bench.agent_bench import ToolCallingAgentBenchmark
27+
28+
if __name__ == "__main__":
    # Every run writes into its own timestamped directory, grouped under the
    # script's file name. The base path is relative, so it is resolved
    # against the current working directory of the process.
    run_stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    experiment_dir = Path("src/rai_bench/rai_bench/experiments").joinpath(
        Path(__file__).stem, run_stamp
    )
    experiment_dir.mkdir(parents=True, exist_ok=True)

    # A single file handler is attached to both loggers below, so benchmark
    # and agent messages end up in the same log file.
    file_handler = logging.FileHandler(experiment_dir / "benchmark.log")
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(
        logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    )

    bench_logger = logging.getLogger("Benchmark logger")
    agent_logger = logging.getLogger("Agent logger")
    for configured_logger in (bench_logger, agent_logger):
        configured_logger.setLevel(logging.INFO)
        configured_logger.addHandler(file_handler)

    # Route each task's own logging through the benchmark logger.
    for task in tasks:
        task.logger = bench_logger

    benchmark = ToolCallingAgentBenchmark(
        tasks=tasks,
        logger=bench_logger,
        results_filename=experiment_dir / "results.csv",
    )

    # Name of the model attribute looked up on the loaded model config.
    model_type = "simple_model"
    model_config = get_llm_model_config_and_vendor(model_type=model_type)[0]
    model_name = getattr(model_config, model_type)

    # One agent is built per task, equipped with that task's expected tools
    # and system prompt; the benchmark is then advanced by one step.
    for task in tasks:
        llm = get_llm_model(model_type=model_type)
        agent = create_conversational_agent(
            llm=llm,
            tools=task.expected_tools,
            system_prompt=task.get_system_prompt(),
            logger=agent_logger,
        )
        benchmark.run_next(agent=agent, model_name=model_name)

0 commit comments

Comments
 (0)