SWEBench Evaluation #19

Merged
28 commits, Aug 29, 2024
Changes from all commits
28 commits
233f65a
Server for running against SWE-bench images, and test script
Aug 27, 2024
d24fd48
Refactor ToolContext so we can talk to RemoteContainerServer
Aug 28, 2024
9332f99
Stateless cmserver
shawnlewis Aug 28, 2024
5047a87
Add temporary programmer-swebench.py script
shawnlewis Aug 28, 2024
43b6c0c
undo changes to programmer.py
shawnlewis Aug 28, 2024
615ad6d
Don't show state ref in output
shawnlewis Aug 28, 2024
d21755f
Standardize toolsets more.
shawnlewis Aug 28, 2024
d9a1707
Remove TODO
shawnlewis Aug 28, 2024
63b30b1
setup conda environment in container
shawnlewis Aug 28, 2024
17a83e8
Working swebench eval function
shawnlewis Aug 28, 2024
037a465
Move data files
shawnlewis Aug 28, 2024
8d92d01
scripts for ingesting existing swe-bench results to weave
shawnlewis Aug 28, 2024
63d85af
Add README
shawnlewis Aug 28, 2024
81fcad2
remove outputs
shawnlewis Aug 28, 2024
2ddd45f
Add examples_v_models script
shawnlewis Aug 28, 2024
a2f7052
Move script
shawnlewis Aug 28, 2024
7561ab8
Move ContainerManager into own dir
shawnlewis Aug 28, 2024
df17bef
Reorganize more
shawnlewis Aug 28, 2024
a6d4b83
more renaming
shawnlewis Aug 28, 2024
bdea5d6
Working Weave Eval
shawnlewis Aug 28, 2024
167f713
script update
shawnlewis Aug 29, 2024
a6b52a2
Default is no run timeout
shawnlewis Aug 29, 2024
5e795bb
Pin Weave
shawnlewis Aug 29, 2024
58abf6a
install dev requirements in ci
shawnlewis Aug 29, 2024
6db0489
install dev requirements in ci
shawnlewis Aug 29, 2024
c28e653
add swebench to requirements dev
shawnlewis Aug 29, 2024
32d7cc8
Fix lint issues
shawnlewis Aug 29, 2024
5e23a10
Tweaks for PR
shawnlewis Aug 29, 2024
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
@@ -26,7 +26,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install setuptools wheel pytest pyright
pip install -r requirements-dev.txt
pip install -e .

- name: Run tests
22 changes: 18 additions & 4 deletions programmer/agent.py
@@ -1,6 +1,7 @@
from typing import Any, Optional, Union
from pydantic import Field
import litellm
import time
from openai.types.chat import (
ChatCompletionMessageParam,
)
@@ -14,6 +15,10 @@
from .environment import get_current_environment, EnvironmentSnapshotKey


class TimeLimitExceeded(Exception):
pass


def get_commit_message(history: list[Any]) -> str:
# Commit message is the most recent message with 'content'
for i in range(len(history) - 1, -1, -1):
@@ -66,9 +71,10 @@ def step(self, state: AgentState) -> AgentState:
The new state of the environment.
"""
Console.step_start("agent", "green")
ref = weave.obj_ref(state)
if ref:
print("state ref:", ref.uri())
# Printing this is ugly
# ref = weave.obj_ref(state)
# if ref:
# print("state ref:", ref.uri())

messages: list[ChatCompletionMessageParam] = [
{"role": "system", "content": self.system_message},
@@ -124,9 +130,17 @@ def step(self, state: AgentState) -> AgentState:
return AgentState(history=new_history, env_snapshot_key=snapshot_key)

@weave.op()
def run(self, state: AgentState):
def run(self, state: AgentState, max_runtime_seconds: int = -1):
start_time = time.time()
while True:
last_message = state.history[-1]
if last_message["role"] == "assistant" and "tool_calls" not in last_message:
return state
state = self.step(state)
if (
max_runtime_seconds > 0
and time.time() - start_time > max_runtime_seconds
):
raise TimeLimitExceeded(
f"Agent runtime exceeded {max_runtime_seconds}s"
)
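
With this change, `run` accepts a `max_runtime_seconds` budget (default -1, meaning no timeout) and raises `TimeLimitExceeded` when the elapsed wall-clock time exceeds the budget, checked after each step. A minimal usage sketch (hypothetical calling code; an `agent` and an initial `AgentState` are assumed to already exist):

```
# Hypothetical calling code; `agent` and `initial_state` are assumed to exist.
from programmer.agent import TimeLimitExceeded

try:
    # Give the agent at most 30 minutes of wall-clock time.
    final_state = agent.run(initial_state, max_runtime_seconds=1800)
except TimeLimitExceeded as err:
    print(f"Run aborted: {err}")
```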
32 changes: 32 additions & 0 deletions programmer/containerserver/README.md
@@ -0,0 +1,32 @@
# Container Manager Server

## Build images on server

We use this to run SWE-bench from the local machine against containers hosted on a remote server. See the [swe-bench README](../swe-bench/README.md) for steps to build the SWE-bench images.

## Run and check server

Copy cmserver.py to the remote machine:
```
gcloud compute scp --zone "us-west1-a" --project "weave-support-367421" cmserver.py programmer-benchmark2:~/
```

On the remote machine, start the server (just 1 worker for now, since there is global state):
```
uvicorn cmserver:app --host 0.0.0.0 --port 8000 --workers 1
```

Tunnel port 8000 from the local machine to the remote machine:
```
gcloud compute ssh --zone "us-west1-a" "programmer-benchmark" --project "weave-support-367421" -- -NL 8000:localhost:8000
```

On the local machine, run the check script:
```
python checkserver.py
```

When the script finishes, there should be no running containers left on the remote machine.
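
cmserver.py itself is not part of this diff, but the endpoints it must expose can be read off checkserver.py below: /container/start, /container/run, /container/write_file, /container/read_file, and /container/stop. As a rough illustration only (not the actual implementation), a stateless FastAPI handler pair for the start and run endpoints might look like this, assuming the Docker SDK:

```
# Illustrative sketch only; the real cmserver.py may differ.
import docker
from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()
client = docker.from_env()


class StartRequest(BaseModel):
    image_id: str


class RunRequest(BaseModel):
    container_id: str
    workdir: str
    command: str


@app.post("/container/start")
def start_container(req: StartRequest):
    # Keep the container alive so later exec calls can target it by id.
    container = client.containers.run(req.image_id, command="sleep infinity", detach=True)
    return {"container_id": container.id}


@app.post("/container/run")
def run_command(req: RunRequest):
    container = client.containers.get(req.container_id)
    exit_code, output = container.exec_run(req.command, workdir=req.workdir)
    return {"exit_code": exit_code, "output": output.decode("utf-8", errors="replace")}
```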

129 changes: 129 additions & 0 deletions programmer/containerserver/checkserver.py
@@ -0,0 +1,129 @@
import requests
import threading
import argparse

# Replace with the actual host and port if different
BASE_URL = "http://127.0.0.1:8000"


def start_container(image_id: str):
response = requests.post(f"{BASE_URL}/container/start", json={"image_id": image_id})
if response.status_code == 200:
return response.json().get("container_id")
else:
print(f"Failed to start container: {response.text}")
return None


def run_command(container_id: str, workdir: str, command: str):
response = requests.post(
f"{BASE_URL}/container/run",
json={"container_id": container_id, "workdir": workdir, "command": command},
)
if response.status_code == 200:
return response.json()
else:
print(f"Failed to run command: {response.text}")
return None


def write_file(container_id: str, file_path: str, file_content: str):
response = requests.post(
f"{BASE_URL}/container/write_file",
json={
"container_id": container_id,
"file_path": file_path,
"file_content": file_content,
},
)
if response.status_code == 200:
return response.json().get("status")
else:
print(f"Failed to write file: {response.text}")
return None


def read_file(container_id: str, file_path: str):
response = requests.post(
f"{BASE_URL}/container/read_file",
json={"container_id": container_id, "file_path": file_path},
)
if response.status_code == 200:
return response.json().get("file_content")
else:
print(f"Failed to read file: {response.text}")
return None


def stop_container(container_id: str, delete: bool):
response = requests.post(
f"{BASE_URL}/container/stop",
json={"container_id": container_id, "delete": delete},
)
if response.status_code == 200:
return response.json().get("status")
else:
print(f"Failed to stop container: {response.text}")
return None


def manage_container(image_id: str, container_index: int):
print(f"Starting container {container_index}...")
container_id = start_container(image_id)
if not container_id:
print(f"Failed to start container {container_index}")
return

print(f"Started container {container_index} with ID: {container_id}")

# Run a command inside the container
output = run_command(container_id, "/", "ls")
if output:
print(f"Container {container_index} command output:\n{output}")

# Write a file inside the container
file_path = f"test_{container_index}.txt"
file_content = f"Hello, this is a test for container {container_index}."
write_status = write_file(container_id, file_path, file_content)
if write_status:
print(f"Container {container_index} write file status: {write_status}")

# Read the file back from the container
read_content = read_file(container_id, file_path)
if read_content:
print(f"Container {container_index} file content:\n{read_content}")

# Stop the container (and delete it)
stop_status = stop_container(container_id, delete=True)
if stop_status:
print(f"Container {container_index} stop status: {stop_status}")


def run_parallel_tests(image_id: str, parallelism: int):
threads = []
for i in range(parallelism):
thread = threading.Thread(target=manage_container, args=(image_id, i))
threads.append(thread)
thread.start()

for thread in threads:
thread.join()


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Run parallel container tests")
parser.add_argument(
"--parallelism",
type=int,
default=1,
help="Number of parallel container operations (default: 1)",
)
parser.add_argument(
"--image-id",
type=str,
default="sweb.eval.x86_64.sympy__sympy-20590",
help="Image ID to test",
)
args = parser.parse_args()

run_parallel_tests(args.image_id, args.parallelism)
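
Beyond the threaded test above, the same helpers can be used for a quick one-off round trip. A small sketch, assuming the SSH tunnel from the containerserver README is up and this is run from programmer/containerserver/:

```
# One-off round trip against the tunneled container manager (sketch).
from checkserver import start_container, run_command, stop_container

container_id = start_container("sweb.eval.x86_64.sympy__sympy-20590")
if container_id:
    print(run_command(container_id, "/", "python --version"))
    stop_container(container_id, delete=True)
```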