SWEBench Evaluation #19

Merged
28 commits, Aug 29, 2024
Changes from all commits
28 commits
233f65a
Server for running against SWE-bench images, and test script
Aug 27, 2024
d24fd48
Refactor ToolContext so we can talk to RemoteContainerServer
Aug 28, 2024
9332f99
Stateless cmserver
shawnlewis Aug 28, 2024
5047a87
Add temporary programmer-swebench.py script
shawnlewis Aug 28, 2024
43b6c0c
undo changes to programmer.py
shawnlewis Aug 28, 2024
615ad6d
Don't show state ref in output
shawnlewis Aug 28, 2024
d21755f
Standardize toolsets more.
shawnlewis Aug 28, 2024
d9a1707
Remove TODO
shawnlewis Aug 28, 2024
63b30b1
setup conda environment in container
shawnlewis Aug 28, 2024
17a83e8
Working swebench eval function
shawnlewis Aug 28, 2024
037a465
Move data files
shawnlewis Aug 28, 2024
8d92d01
scripts for ingesting existing swe-bench results to weave
shawnlewis Aug 28, 2024
63d85af
Add README
shawnlewis Aug 28, 2024
81fcad2
remove outputs
shawnlewis Aug 28, 2024
2ddd45f
Add examples_v_models script
shawnlewis Aug 28, 2024
a2f7052
Move script
shawnlewis Aug 28, 2024
7561ab8
Move ContainerManager into own dir
shawnlewis Aug 28, 2024
df17bef
Reorganize more
shawnlewis Aug 28, 2024
a6d4b83
more renaming
shawnlewis Aug 28, 2024
bdea5d6
Working Weave Eval
shawnlewis Aug 28, 2024
167f713
script update
shawnlewis Aug 29, 2024
a6b52a2
Default is no run timeout
shawnlewis Aug 29, 2024
5e795bb
Pin Weave
shawnlewis Aug 29, 2024
58abf6a
install dev requirements in ci
shawnlewis Aug 29, 2024
6db0489
install dev requirements in ci
shawnlewis Aug 29, 2024
c28e653
add swebench to requirements dev
shawnlewis Aug 29, 2024
32d7cc8
Fix lint issues
shawnlewis Aug 29, 2024
5e23a10
Tweaks for PR
shawnlewis Aug 29, 2024
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
@@ -26,7 +26,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install setuptools wheel pytest pyright
pip install -r requirements-dev.txt
pip install -e .

- name: Run tests
22 changes: 18 additions & 4 deletions programmer/agent.py
@@ -1,6 +1,7 @@
from typing import Any, Optional, Union
from pydantic import Field
import litellm
import time
from openai.types.chat import (
ChatCompletionMessageParam,
)
@@ -14,6 +15,10 @@
from .environment import get_current_environment, EnvironmentSnapshotKey


class TimeLimitExceeded(Exception):
pass


def get_commit_message(history: list[Any]) -> str:
# Commit message is the most recent message with 'content'
for i in range(len(history) - 1, -1, -1):
@@ -66,9 +71,10 @@ def step(self, state: AgentState) -> AgentState:
The new state of the environment.
"""
Console.step_start("agent", "green")
ref = weave.obj_ref(state)
if ref:
print("state ref:", ref.uri())
# Printing this is ugly
# ref = weave.obj_ref(state)
# if ref:
# print("state ref:", ref.uri())

messages: list[ChatCompletionMessageParam] = [
{"role": "system", "content": self.system_message},
@@ -124,9 +130,17 @@ def step(self, state: AgentState) -> AgentState:
return AgentState(history=new_history, env_snapshot_key=snapshot_key)

@weave.op()
def run(self, state: AgentState):
def run(self, state: AgentState, max_runtime_seconds: int = -1):
start_time = time.time()
while True:
last_message = state.history[-1]
if last_message["role"] == "assistant" and "tool_calls" not in last_message:
return state
state = self.step(state)
if (
max_runtime_seconds > 0
and time.time() - start_time > max_runtime_seconds
):
raise TimeLimitExceeded(
f"Agent runtime exceeded {max_runtime_seconds}s"
)
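
With this change, `run` accepts a `max_runtime_seconds` budget (default -1, meaning no timeout) and raises `TimeLimitExceeded` when the elapsed wall-clock time exceeds the budget, checked after each step. A minimal usage sketch (hypothetical calling code; an `agent` and an initial `AgentState` are assumed to already exist):

```
# Hypothetical calling code; `agent` and `initial_state` are assumed to exist.
from programmer.agent import TimeLimitExceeded

try:
    # Give the agent at most 30 minutes of wall-clock time.
    final_state = agent.run(initial_state, max_runtime_seconds=1800)
except TimeLimitExceeded as err:
    print(f"Run aborted: {err}")
```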
32 changes: 32 additions & 0 deletions programmer/containerserver/README.md
@@ -0,0 +1,32 @@
# Container Manager Server

## Build images on server

We use this to run SWE-bench from the local machine against containers hosted on a remote server. See the [swe-bench README](../swe-bench/README.md) for steps to build the SWE-bench images.

## Run and check server

Copy cmserver.py to the remote machine:
```
gcloud compute scp --zone "us-west1-a" --project "weave-support-367421" cmserver.py programmer-benchmark2:~/
```

On the remote machine, start the server (just 1 worker for now, since there is global state):
```
uvicorn cmserver:app --host 0.0.0.0 --port 8000 --workers 1
```

Tunnel port 8000 from the local machine to the remote machine:
```
gcloud compute ssh --zone "us-west1-a" "programmer-benchmark" --project "weave-support-367421" -- -NL 8000:localhost:8000
```

On the local machine, run the check script:
```
python checkserver.py
```

When the script finishes, there should be no running containers left on the remote machine.
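
cmserver.py itself is not part of this diff, but the endpoints it must expose can be read off checkserver.py below: /container/start, /container/run, /container/write_file, /container/read_file, and /container/stop. As a rough illustration only (not the actual implementation), a stateless FastAPI handler pair for the start and run endpoints might look like this, assuming the Docker SDK:

```
# Illustrative sketch only; the real cmserver.py may differ.
import docker
from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()
client = docker.from_env()


class StartRequest(BaseModel):
    image_id: str


class RunRequest(BaseModel):
    container_id: str
    workdir: str
    command: str


@app.post("/container/start")
def start_container(req: StartRequest):
    # Keep the container alive so later exec calls can target it by id.
    container = client.containers.run(req.image_id, command="sleep infinity", detach=True)
    return {"container_id": container.id}


@app.post("/container/run")
def run_command(req: RunRequest):
    container = client.containers.get(req.container_id)
    exit_code, output = container.exec_run(req.command, workdir=req.workdir)
    return {"exit_code": exit_code, "output": output.decode("utf-8", errors="replace")}
```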

129 changes: 129 additions & 0 deletions programmer/containerserver/checkserver.py
@@ -0,0 +1,129 @@
import requests
import threading
import argparse

# Replace with the actual host and port if different
BASE_URL = "http://127.0.0.1:8000"


def start_container(image_id: str):
response = requests.post(f"{BASE_URL}/container/start", json={"image_id": image_id})
if response.status_code == 200:
return response.json().get("container_id")
else:
print(f"Failed to start container: {response.text}")
return None


def run_command(container_id: str, workdir: str, command: str):
response = requests.post(
f"{BASE_URL}/container/run",
json={"container_id": container_id, "workdir": workdir, "command": command},
)
if response.status_code == 200:
return response.json()
else:
print(f"Failed to run command: {response.text}")
return None


def write_file(container_id: str, file_path: str, file_content: str):
response = requests.post(
f"{BASE_URL}/container/write_file",
json={
"container_id": container_id,
"file_path": file_path,
"file_content": file_content,
},
)
if response.status_code == 200:
return response.json().get("status")
else:
print(f"Failed to write file: {response.text}")
return None


def read_file(container_id: str, file_path: str):
response = requests.post(
f"{BASE_URL}/container/read_file",
json={"container_id": container_id, "file_path": file_path},
)
if response.status_code == 200:
return response.json().get("file_content")
else:
print(f"Failed to read file: {response.text}")
return None


def stop_container(container_id: str, delete: bool):
response = requests.post(
f"{BASE_URL}/container/stop",
json={"container_id": container_id, "delete": delete},
)
if response.status_code == 200:
return response.json().get("status")
else:
print(f"Failed to stop container: {response.text}")
return None


def manage_container(image_id: str, container_index: int):
print(f"Starting container {container_index}...")
container_id = start_container(image_id)
if not container_id:
print(f"Failed to start container {container_index}")
return

print(f"Started container {container_index} with ID: {container_id}")

# Run a command inside the container
output = run_command(container_id, "/", "ls")
if output:
print(f"Container {container_index} command output:\n{output}")

# Write a file inside the container
file_path = f"test_{container_index}.txt"
file_content = f"Hello, this is a test for container {container_index}."
write_status = write_file(container_id, file_path, file_content)
if write_status:
print(f"Container {container_index} write file status: {write_status}")

# Read the file back from the container
read_content = read_file(container_id, file_path)
if read_content:
print(f"Container {container_index} file content:\n{read_content}")

# Stop the container (and delete it)
stop_status = stop_container(container_id, delete=True)
if stop_status:
print(f"Container {container_index} stop status: {stop_status}")


def run_parallel_tests(image_id: str, parallelism: int):
threads = []
for i in range(parallelism):
thread = threading.Thread(target=manage_container, args=(image_id, i))
threads.append(thread)
thread.start()

for thread in threads:
thread.join()


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Run parallel container tests")
parser.add_argument(
"--parallelism",
type=int,
default=1,
help="Number of parallel container operations (default: 1)",
)
parser.add_argument(
"--image-id",
type=str,
default="sweb.eval.x86_64.sympy__sympy-20590",
help="Image ID to test",
)
args = parser.parse_args()

run_parallel_tests(args.image_id, args.parallelism)
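
Beyond the threaded test above, the same helpers can be used for a quick one-off round trip. A small sketch, assuming the SSH tunnel from the containerserver README is up and this is run from programmer/containerserver/:

```
# One-off round trip against the tunneled container manager (sketch).
from checkserver import start_container, run_command, stop_container

container_id = start_container("sweb.eval.x86_64.sympy__sympy-20590")
if container_id:
    print(run_command(container_id, "/", "python --version"))
    stop_container(container_id, delete=True)
```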