diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 013d21d..ca60206 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -26,7 +26,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
- pip install setuptools wheel pytest pyright
+ pip install -r requirements-dev.txt
pip install -e .
- name: Run tests
diff --git a/programmer/agent.py b/programmer/agent.py
index 603fa81..bdb40fc 100644
--- a/programmer/agent.py
+++ b/programmer/agent.py
@@ -1,6 +1,7 @@
from typing import Any, Optional, Union
from pydantic import Field
import litellm
+import time
from openai.types.chat import (
ChatCompletionMessageParam,
)
@@ -14,6 +15,10 @@
from .environment import get_current_environment, EnvironmentSnapshotKey
+class TimeLimitExceeded(Exception):
+ pass
+
+
def get_commit_message(history: list[Any]) -> str:
# Commit message is the most recent message with 'content'
for i in range(len(history) - 1, -1, -1):
@@ -66,9 +71,10 @@ def step(self, state: AgentState) -> AgentState:
The new state of the environment.
"""
Console.step_start("agent", "green")
- ref = weave.obj_ref(state)
- if ref:
- print("state ref:", ref.uri())
+ # Printing this is ugly
+ # ref = weave.obj_ref(state)
+ # if ref:
+ # print("state ref:", ref.uri())
messages: list[ChatCompletionMessageParam] = [
{"role": "system", "content": self.system_message},
@@ -124,9 +130,17 @@ def step(self, state: AgentState) -> AgentState:
return AgentState(history=new_history, env_snapshot_key=snapshot_key)
@weave.op()
- def run(self, state: AgentState):
+ def run(self, state: AgentState, max_runtime_seconds: int = -1):
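+ # max_runtime_seconds <= 0 disables the time limit; the limit is checked
+ # after each step completes, so an in-progress step is never interrupted.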
+ start_time = time.time()
while True:
last_message = state.history[-1]
if last_message["role"] == "assistant" and "tool_calls" not in last_message:
return state
state = self.step(state)
+ if (
+ max_runtime_seconds > 0
+ and time.time() - start_time > max_runtime_seconds
+ ):
+ raise TimeLimitExceeded(
+ f"Agent runtime exceeded {max_runtime_seconds}s"
+ )
diff --git a/programmer/containerserver/README.md b/programmer/containerserver/README.md
new file mode 100644
index 0000000..3fc4307
--- /dev/null
+++ b/programmer/containerserver/README.md
@@ -0,0 +1,32 @@
+# Container Manager Server
+
+## Build images on server
+
+We use this to run SWE-bench from a local machine against containers on a remote server. See the [swebench README](../swebench/README.md) for steps to build the SWE-bench images.
+
+## Run and check server
+
+Copy cmserver.py to the remote machine:
+```
+gcloud compute scp --zone "us-west1-a" --project "weave-support-367421" cmserver.py programmer-benchmark2:~/
+```
+
+On the remote machine, start the server (just 1 worker for now; the manager keeps global state):
+```
+uvicorn cmserver:app --host 0.0.0.0 --port 8000 --workers 1
+```
+
+Tunnel from the local machine to the remote:
+```
+gcloud compute ssh --zone "us-west1-a" "programmer-benchmark" --project "weave-support-367421" -- -NL 8000:localhost:8000
+```
+
+On the local machine, run the check script:
+```
+python checkserver.py
+```
+
+When the check finishes, there should be no running containers left on the remote machine.
+
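+Quick manual smoke test of the API through the tunnel (a sketch; assumes the
+example image below has been built on the server):
+
+```
+curl -s -X POST localhost:8000/container/start \
+  -H 'Content-Type: application/json' \
+  -d '{"image_id": "sweb.eval.x86_64.sympy__sympy-20590"}'
+# returns {"container_id": "..."}; use it to run a command, then clean up:
+curl -s -X POST localhost:8000/container/run \
+  -H 'Content-Type: application/json' \
+  -d '{"container_id": "<container_id>", "workdir": "/", "command": "ls"}'
+curl -s -X POST localhost:8000/container/stop \
+  -H 'Content-Type: application/json' \
+  -d '{"container_id": "<container_id>", "delete": true}'
+```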
diff --git a/programmer/containerserver/checkserver.py b/programmer/containerserver/checkserver.py
new file mode 100644
index 0000000..e214180
--- /dev/null
+++ b/programmer/containerserver/checkserver.py
@@ -0,0 +1,129 @@
+import requests
+import threading
+import argparse
+
+# Replace with the actual host and port if different
+BASE_URL = "http://127.0.0.1:8000"
+
+
+def start_container(image_id: str):
+ response = requests.post(f"{BASE_URL}/container/start", json={"image_id": image_id})
+ if response.status_code == 200:
+ return response.json().get("container_id")
+ else:
+ print(f"Failed to start container: {response.text}")
+ return None
+
+
+def run_command(container_id: str, workdir: str, command: str):
+ response = requests.post(
+ f"{BASE_URL}/container/run",
+ json={"container_id": container_id, "workdir": workdir, "command": command},
+ )
+ if response.status_code == 200:
+ return response.json()
+ else:
+ print(f"Failed to run command: {response.text}")
+ return None
+
+
+def write_file(container_id: str, file_path: str, file_content: str):
+ response = requests.post(
+ f"{BASE_URL}/container/write_file",
+ json={
+ "container_id": container_id,
+ "file_path": file_path,
+ "file_content": file_content,
+ },
+ )
+ if response.status_code == 200:
+ return response.json().get("status")
+ else:
+ print(f"Failed to write file: {response.text}")
+ return None
+
+
+def read_file(container_id: str, file_path: str):
+ response = requests.post(
+ f"{BASE_URL}/container/read_file",
+ json={"container_id": container_id, "file_path": file_path},
+ )
+ if response.status_code == 200:
+ return response.json().get("file_content")
+ else:
+ print(f"Failed to read file: {response.text}")
+ return None
+
+
+def stop_container(container_id: str, delete: bool):
+ response = requests.post(
+ f"{BASE_URL}/container/stop",
+ json={"container_id": container_id, "delete": delete},
+ )
+ if response.status_code == 200:
+ return response.json().get("status")
+ else:
+ print(f"Failed to stop container: {response.text}")
+ return None
+
+
+def manage_container(image_id: str, container_index: int):
+ print(f"Starting container {container_index}...")
+ container_id = start_container(image_id)
+ if not container_id:
+ print(f"Failed to start container {container_index}")
+ return
+
+ print(f"Started container {container_index} with ID: {container_id}")
+
+ # Run a command inside the container
+ output = run_command(container_id, "/", "ls")
+ if output:
+ print(f"Container {container_index} command output:\n{output}")
+
+ # Write a file inside the container
+ file_path = f"test_{container_index}.txt"
+ file_content = f"Hello, this is a test for container {container_index}."
+ write_status = write_file(container_id, file_path, file_content)
+ if write_status:
+ print(f"Container {container_index} write file status: {write_status}")
+
+ # Read the file back from the container
+ read_content = read_file(container_id, file_path)
+ if read_content:
+ print(f"Container {container_index} file content:\n{read_content}")
+
+ # Stop the container (and delete it)
+ stop_status = stop_container(container_id, delete=True)
+ if stop_status:
+ print(f"Container {container_index} stop status: {stop_status}")
+
+
+def run_parallel_tests(image_id: str, parallelism: int):
+ threads = []
+ for i in range(parallelism):
+ thread = threading.Thread(target=manage_container, args=(image_id, i))
+ threads.append(thread)
+ thread.start()
+
+ for thread in threads:
+ thread.join()
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description="Run parallel container tests")
+ parser.add_argument(
+ "--parallelism",
+ type=int,
+ default=1,
+ help="Number of parallel container operations (default: 1)",
+ )
+ parser.add_argument(
+ "--image-id",
+ type=str,
+ default="sweb.eval.x86_64.sympy__sympy-20590",
+ help="Image ID to test",
+ )
+ args = parser.parse_args()
+
+ run_parallel_tests(args.image_id, args.parallelism)
diff --git a/programmer/containerserver/cmserver.py b/programmer/containerserver/cmserver.py
new file mode 100644
index 0000000..2d8dd3e
--- /dev/null
+++ b/programmer/containerserver/cmserver.py
@@ -0,0 +1,178 @@
+import os
+import tarfile
+from io import BytesIO
+from concurrent.futures import ThreadPoolExecutor
+import asyncio
+import docker
+from docker.errors import NotFound
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+
+
+# DockerContainerManager class
+class DockerContainerManager:
+ def __init__(self):
+ self.client = docker.from_env()
+ self.executor = ThreadPoolExecutor()
+
+ async def start_container(self, image_id: str):
+ loop = asyncio.get_event_loop()
+ container = await loop.run_in_executor(
+ self.executor, self._run_container, image_id
+ )
+ return container.short_id
+
+ def _run_container(self, image_id: str):
+ return self.client.containers.run(
+ image_id, detach=True, command="tail -f /dev/null"
+ )
+
+ def _get_container(self, container_id: str):
+ return self.client.containers.get(container_id)
+
+ async def run_command(self, container_id: str, workdir: str, command: str):
+ loop = asyncio.get_event_loop()
+ exec_result = await loop.run_in_executor(
+ self.executor, self._exec_run, container_id, command, workdir
+ )
+ return {
+ "exit_code": exec_result.exit_code,
+ "output": exec_result.output.decode("utf-8"),
+ }
+
+ def _exec_run(self, container_id: str, command: str, workdir: str):
+ container = self._get_container(container_id)
+ return container.exec_run(command, workdir=workdir)
+
+ async def write_file(self, container_id: str, file_path: str, file_content: str):
+ file_path = os.path.join("/", file_path)
+ container = self._get_container(container_id)
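+ # docker-py's put_archive only accepts tar archives, so wrap the content
+ # in an in-memory tarball.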
+ tarstream = BytesIO()
+ with tarfile.open(fileobj=tarstream, mode="w") as tar:
+ tarinfo = tarfile.TarInfo(name=os.path.basename(file_path))
+ tarinfo.size = len(file_content)
+ tar.addfile(tarinfo, BytesIO(file_content.encode("utf-8")))
+ tarstream.seek(0)
+
+ loop = asyncio.get_event_loop()
+ await loop.run_in_executor(
+ self.executor,
+ container.put_archive,
+ os.path.dirname(file_path),
+ tarstream,
+ )
+
+ async def read_file(self, container_id: str, file_path: str):
+ container = self._get_container(container_id)
+ loop = asyncio.get_event_loop()
+ bits, _ = await loop.run_in_executor(
+ self.executor, container.get_archive, file_path
+ )
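+ # get_archive returns the file wrapped in a tar stream; buffer it, then
+ # extract the single member.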
+ file_content = BytesIO()
+ for chunk in bits:
+ file_content.write(chunk)
+ file_content.seek(0)
+ with tarfile.open(fileobj=file_content) as tar:
+ member = tar.getmembers()[0]
+ extract_result = tar.extractfile(member)
+ if extract_result is None:
+ raise Exception(f"Unexpected tar.extractfile result for: {file_path}")
+ file_data = extract_result.read()
+ return file_data.decode("utf-8")
+
+ async def stop_container(self, container_id: str, delete: bool = False):
+ container = self._get_container(container_id)
+ loop = asyncio.get_event_loop()
+ await loop.run_in_executor(self.executor, container.stop)
+ if delete:
+ await loop.run_in_executor(self.executor, container.remove)
+
+
+# FastAPI setup
+app = FastAPI()
+container_manager = DockerContainerManager()
+
+
+class StartContainerRequest(BaseModel):
+ image_id: str
+
+
+class StopContainerRequest(BaseModel):
+ container_id: str
+ delete: bool
+
+
+class CommandRequest(BaseModel):
+ container_id: str
+ workdir: str
+ command: str
+
+
+class FileRequest(BaseModel):
+ container_id: str
+ file_path: str
+ file_content: str
+
+
+class FilePathRequest(BaseModel):
+ container_id: str
+ file_path: str
+
+
+@app.post("/container/start")
+async def start_container(request: StartContainerRequest):
+ try:
+ container_id = await container_manager.start_container(request.image_id)
+ return {"container_id": container_id}
+ except Exception as e:
+ raise HTTPException(status_code=500, detail=str(e))
+
+
+@app.post("/container/run")
+async def run_command(request: CommandRequest):
+ try:
+ result = await container_manager.run_command(
+ request.container_id, request.workdir, request.command
+ )
+ return {"exit_code": result["exit_code"], "output": result["output"]}
+ except Exception as e:
+ raise HTTPException(status_code=500, detail=str(e))
+
+
+@app.post("/container/write_file")
+async def write_file(request: FileRequest):
+ try:
+ await container_manager.write_file(
+ request.container_id, request.file_path, request.file_content
+ )
+ return {"status": "file written"}
+ except NotFound as e:
+ raise HTTPException(status_code=404, detail=str(e))
+ except Exception as e:
+ raise HTTPException(status_code=500, detail=str(e))
+
+
+@app.post("/container/read_file")
+async def read_file(request: FilePathRequest):
+ try:
+ file_content = await container_manager.read_file(
+ request.container_id, request.file_path
+ )
+ return {"file_content": file_content}
+ except NotFound as e:
+ raise HTTPException(status_code=404, detail=str(e))
+ except Exception as e:
+ raise HTTPException(status_code=500, detail=str(e))
+
+
+@app.post("/container/stop")
+async def stop_container(request: StopContainerRequest):
+ try:
+ await container_manager.stop_container(request.container_id, request.delete)
+ return {"status": "container stopped"}
+ except Exception as e:
+ raise HTTPException(status_code=500, detail=str(e))
+
+
+# To run the server (1 worker only, since the manager keeps global state):
+# uvicorn cmserver:app --host 0.0.0.0 --port 8000 --workers 1
diff --git a/programmer/evals/eval_repeated_edits.py b/programmer/evals/eval_repeated_edits.py
index 3443a4c..9d63819 100644
--- a/programmer/evals/eval_repeated_edits.py
+++ b/programmer/evals/eval_repeated_edits.py
@@ -10,14 +10,14 @@
from ..agent import AgentState, Agent
from ..config import agent, agent_claude, agent_claude_replace, agent_replace
-from ..tools import tool_context
+from ..tools import tool_context, LocalToolContext
# @pytest.fixture
@contextmanager
def tempdir():
with tempfile.TemporaryDirectory() as dir_:
- with tool_context(dir_) as tc:
+ with tool_context(LocalToolContext(dir_)) as tc:
yield tc
diff --git a/programmer/programmer.py b/programmer/programmer.py
index 739db86..78f9b1a 100644
--- a/programmer/programmer.py
+++ b/programmer/programmer.py
@@ -10,9 +10,9 @@
import weave
-from .agent import AgentState, get_commit_message
+from .agent import Agent, AgentState, get_commit_message
from .console import Console
-from .config import agent
+from .config import agent_replace
from .environment import (
environment_session,
restore_environment,
@@ -34,9 +34,10 @@ def get_user_input():
@weave.op
def user_input_step(state: AgentState) -> AgentState:
Console.step_start("user_input", "purple")
- ref = weave.obj_ref(state)
- if ref:
- print("state ref:", ref.uri())
+ # Printing this is ugly
+ # ref = weave.obj_ref(state)
+ # if ref:
+ # print("state ref:", ref.uri())
user_input = get_user_input()
environment = get_current_environment()
history = state.history + [
@@ -63,7 +64,7 @@ def make_environment():
@weave.op
-def session(agent_state: AgentState):
+def session(agent: Agent, agent_state: AgentState):
call = weave.get_current_call()
session_id = None
@@ -160,7 +161,7 @@ def main():
],
)
- session(state)
+ session(agent_replace, state)
if __name__ == "__main__":
diff --git a/programmer/swe-bench/code.patch b/programmer/swe-bench/code.patch
deleted file mode 100644
index 9d29e02..0000000
--- a/programmer/swe-bench/code.patch
+++ /dev/null
@@ -1,29 +0,0 @@
-diff --git a/sympy/core/numbers.py b/sympy/core/numbers.py
---- a/sympy/core/numbers.py
-+++ b/sympy/core/numbers.py
-@@ -1624,10 +1624,11 @@ def __new__(cls, p, q=None, gcd=None):
-
- q = 1
- gcd = 1
-+ Q = 1
-
- if not isinstance(p, SYMPY_INTS):
- p = Rational(p)
-- q *= p.q
-+ Q *= p.q
- p = p.p
- else:
- p = int(p)
-@@ -1635,9 +1636,10 @@ def __new__(cls, p, q=None, gcd=None):
- if not isinstance(q, SYMPY_INTS):
- q = Rational(q)
- p *= q.q
-- q = q.p
-+ Q *= q.p
- else:
-- q = int(q)
-+ Q *= int(q)
-+ q = Q
-
- # p and q are now ints
- if q == 0:
diff --git a/programmer/swe-bench/problem.txt b/programmer/swe-bench/problem.txt
deleted file mode 100644
index ecc84ee..0000000
--- a/programmer/swe-bench/problem.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-
-
-Rational calc value error
-python 3.11, sympy 1.11.1
-when calc Rational('0.5', '100'), the value is 1/100100; but Rational(0.5, 100) the value is 1/200, this value is the true value, and the version of sympy 1.8 is normal
-
-
diff --git a/programmer/swe-bench/test_code.patch b/programmer/swe-bench/test_code.patch
deleted file mode 100644
index 6fa0c19..0000000
--- a/programmer/swe-bench/test_code.patch
+++ /dev/null
@@ -1,17 +0,0 @@
-diff --git a/sympy/core/tests/test_numbers.py b/sympy/core/tests/test_numbers.py
---- a/sympy/core/tests/test_numbers.py
-+++ b/sympy/core/tests/test_numbers.py
-@@ -366,6 +366,13 @@ def test_Rational_new():
- assert n.q == 4
- assert n.p == -2
-
-+def test_issue_24543():
-+ for p in ('1.5', 1.5, 2):
-+ for q in ('1.5', 1.5, 2):
-+ assert Rational(p, q).as_numer_denom() == Rational('%s/%s'%(p,q)).as_numer_denom()
-+
-+ assert Rational('0.5', '100') == Rational(1, 200)
-+
-
- def test_Number_new():
- """"
diff --git a/programmer/swebench/README.md b/programmer/swebench/README.md
new file mode 100644
index 0000000..582e5a7
--- /dev/null
+++ b/programmer/swebench/README.md
@@ -0,0 +1,63 @@
+# SWE Bench programmer evaluation
+
+## Build SWE-bench images
+
+First complete the setup (below), then run this command to build all the images. Passing `--cache_level instance` tells the script not to delete the per-instance images, which are what we use with the container manager.
+
+```
+python -m swebench.harness.run_evaluation \
+ --predictions_path gold \
+ --max_workers 24 \
+ --run_id validate-gold \
+ --dataset_name princeton-nlp/SWE-bench_Verified \
+ --cache_level instance
+```
+
+
+## Remote machine setup instructions (GCP VM, Ubuntu 20.04)
+
+```
+
+sudo snap install docker
+sudo groupadd docker
+sudo usermod -aG docker $USER
+sudo chown root:docker /var/run/docker.sock
+sudo chmod 660 /var/run/docker.sock
+
+sudo apt update
+sudo apt install -y \
+ build-essential \
+ libbz2-dev \
+ libreadline-dev \
+ libssl-dev \
+ zlib1g-dev \
+ libsqlite3-dev \
+ libffi-dev \
+ libncursesw5-dev \
+ libgdbm-dev \
+ liblzma-dev \
+ tk-dev \
+ libdb-dev \
+ libexpat1-dev \
+ libmpdec-dev \
+ libxml2-dev \
+ libxmlsec1-dev
+
+# pyenv
+curl https://pyenv.run | bash
+echo 'export PYENV_ROOT="$HOME/.pyenv"
+[[ -d $PYENV_ROOT/bin ]] && export PATH="$PYENV_ROOT/bin:$PATH"
+eval "$(pyenv init -)"
+eval "$(pyenv virtualenv-init -)"' >> ~/.bashrc
+## exit and log back in so the shell picks up the pyenv init
+
+pyenv install 3.10.12
+pyenv virtualenv 3.10.12 swe-bench
+
+git clone https://github.com/princeton-nlp/SWE-bench.git
+cd SWE-bench
+pyenv local swe-bench
+pip install -e .
+```
\ No newline at end of file
diff --git a/programmer/swebench/__init__.py b/programmer/swebench/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/programmer/swe-bench/ensembled_annotations_public.csv b/programmer/swebench/data/ensembled_annotations_public.csv
similarity index 100%
rename from programmer/swe-bench/ensembled_annotations_public.csv
rename to programmer/swebench/data/ensembled_annotations_public.csv
diff --git a/programmer/swe-bench/samples_with_3_annotations_public.csv b/programmer/swebench/data/samples_with_3_annotations_public.csv
similarity index 100%
rename from programmer/swe-bench/samples_with_3_annotations_public.csv
rename to programmer/swebench/data/samples_with_3_annotations_public.csv
diff --git a/programmer/swe-bench/swebench-verified.parquet b/programmer/swebench/data/swebench-verified.parquet
similarity index 100%
rename from programmer/swe-bench/swebench-verified.parquet
rename to programmer/swebench/data/swebench-verified.parquet
diff --git a/programmer/swebench/evaluate.py b/programmer/swebench/evaluate.py
new file mode 100644
index 0000000..38a5725
--- /dev/null
+++ b/programmer/swebench/evaluate.py
@@ -0,0 +1,88 @@
+import asyncio
+import pandas as pd
+from typing import Optional
+import random
+import weave
+
+from .swebench_model import SWEBenchProgrammerModel
+from .score import score_swebench
+from ..agent import Agent
+from ..config import SYSTEM_MESSAGE
+from ..tools import (
+ list_files,
+ run_command,
+ view_image,
+ read_lines_from_file,
+ replace_lines_in_file,
+)
+
+
+def load_raw_dataset(name: str, split: str):
+ return pd.read_parquet(
+ f"hf://datasets/princeton-nlp/{name}/data/{split}-00000-of-00001.parquet"
+ )
+
+
+def load_weave_dataset(
+ name: str,
+ split: str,
+ limit: Optional[int] = None,
+ instance_ids: Optional[list[str]] = None,
+ shuffle_seed: Optional[int] = None,
+):
+ df = load_raw_dataset(name, split)
+
+ data_list = df.to_dict("records")
+ if shuffle_seed is not None:
+ random.seed(shuffle_seed)
+ random.shuffle(data_list)
+ data_list = [
+ r for r in data_list if instance_ids is None or r["instance_id"] in instance_ids
+ ]
+ data_list = data_list[:limit] if limit else data_list
+ data_list = [{"instance": r} for r in data_list]
+
+ return weave.Dataset(name=f"Verified-{limit}-{shuffle_seed}", rows=data_list) # type: ignore
+
+
+def main():
+ weave.init("weavedev-programmereval1")
+ instance_ids = [
+ "django__django-16569",
+ "django__django-11099",
+ "scikit-learn__scikit-learn-12585",
+ "django__django-13658",
+ "django__django-9296",
+ "astropy__astropy-14309",
+ "django__django-12155",
+ "django__django-16527",
+ "sympy__sympy-24213",
+ "django__django-11066",
+ ]
+ # ds = load_weave_dataset("SWE-bench_Verified", "test", instance_ids=instance_ids)
+ ds = load_weave_dataset("SWE-bench_Verified", "test", limit=50, shuffle_seed=42)
+ eval = weave.Evaluation(
+ name="SWE-bench_Verified", dataset=ds, scorers=[score_swebench], trials=5
+ )
+
+ model = SWEBenchProgrammerModel(
+ agent=Agent(
+ model_name="gpt-4o-2024-08-06",
+ temperature=0.7,
+ system_message=SYSTEM_MESSAGE,
+ tools=[
+ list_files,
+ run_command,
+ view_image,
+ read_lines_from_file,
+ replace_lines_in_file,
+ ],
+ ),
+ max_runtime_seconds=180,
+ )
+ res = asyncio.run(eval.evaluate(model))
+ print("RES", res)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/programmer/swebench/ingest/README.md b/programmer/swebench/ingest/README.md
new file mode 100644
index 0000000..31b6eee
--- /dev/null
+++ b/programmer/swebench/ingest/README.md
@@ -0,0 +1,3 @@
+# weave swe-bench eval ingestion
+
+These scripts ingest existing https://github.com/swe-bench/experiments results into Weave evals.
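+
+For example, to ingest one model's results (a sketch; assumes a local checkout
+of the experiments repo, with the model name taken from the directory names
+under evaluation/verified/):
+
+```
+python ingest_eval.py \
+  --experiments_repo_path ~/code/experiments \
+  --dataset_name SWE-bench_Verified \
+  --model_name 20240620_sweagent_claude3.5sonnet
+```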
diff --git a/programmer/swebench/ingest/ingest_eval.py b/programmer/swebench/ingest/ingest_eval.py
new file mode 100644
index 0000000..d000e0b
--- /dev/null
+++ b/programmer/swebench/ingest/ingest_eval.py
@@ -0,0 +1,182 @@
+import argparse
+import os
+import sys
+import asyncio
+import json
+import contextvars
+from rich import print
+
+import weave
+from make_dataset import load_weave_dataset
+
+
+context_var = contextvars.ContextVar("context", default={})
+
+
+def load_instance_eval_file(
+ experiments_repo_path, dataset_name, model_name, instance_id, file_name
+):
+ dataset_name_short = dataset_name.split("_")[1].lower()
+ file_path = os.path.join(
+ experiments_repo_path,
+ "evaluation",
+ dataset_name_short,
+ model_name,
+ "logs",
+ instance_id,
+ file_name,
+ )
+ print(f"Loading file: {file_path}")
+
+ if os.path.exists(file_path):
+ with open(file_path, "r") as file:
+ return file.read()
+ else:
+ return None
+
+
+def load_instance_eval_from_logs(
+ experiments_repo_path, dataset_name, model_name, instance_id
+):
+ report_json_file = load_instance_eval_file(
+ experiments_repo_path,
+ dataset_name,
+ model_name,
+ instance_id,
+ "report.json",
+ )
+ report_json = None
+ if report_json_file is not None:
+ report_json = json.loads(report_json_file).get(instance_id)
+ no_report = False
+ if report_json is None:
+ no_report = True
+
+ return {
+ "patch": load_instance_eval_file(
+ experiments_repo_path, dataset_name, model_name, instance_id, "patch.diff"
+ ),
+ "report": report_json,
+ "no_report": no_report,
+ }
+
+
+def load_instance_eval_from_results(
+ experiments_repo_path, dataset_name, model_name, instance_id
+):
+ dataset_name_short = dataset_name.split("_")[1].lower()
+ file_path = os.path.join(
+ experiments_repo_path,
+ "evaluation",
+ dataset_name_short,
+ model_name,
+ "results",
+ "results.json",
+ )
+ with open(file_path, "r") as file:
+ results = json.loads(file.read())
+ summary = {}
+ for k, instance_ids in results.items():
+ summary[k] = instance_id in instance_ids
+
+ return summary
+
+
+class SWEBenchOfflineModel(weave.Model):
+ @weave.op
+ def predict(self, instance_id: str):
+ context = context_var.get()
+ experiments_repo_path = context.get("experiments_repo_path")
+ dataset_name = context.get("dataset_name")
+ return load_instance_eval_from_results(
+ experiments_repo_path, dataset_name, self.name, instance_id
+ )
+
+
+@weave.op
+def score_from_logs(model_output: dict):
+ result = {}
+ if model_output.get("report"):
+ result.update(model_output["report"])
+ result["no_report"] = model_output["no_report"]
+ return result
+
+
+@weave.op
+def score(model_output: dict):
+ return model_output
+
+
+def ingest_eval(experiments_repo_path, dataset_name, model_name):
+ print(f"Ingesting evaluation logs for:")
+ print(f"Dataset: {dataset_name}")
+ print(f"Model: {model_name}")
+ print(f"From repository: {experiments_repo_path}")
+
+ dataset = load_weave_dataset(dataset_name, "test")
+ eval = weave.Evaluation(name=dataset_name, dataset=dataset, scorers=[score])
+
+ context_var.set(
+ {
+ "experiments_repo_path": experiments_repo_path,
+ "dataset_name": dataset_name,
+ }
+ )
+
+ model = SWEBenchOfflineModel(name=model_name)
+ # result, call = asyncio.run(eval.evaluate.call(eval, model))
+ result = asyncio.run(eval.evaluate(model))
+
+ print(result)
+ # call.set_display_name(model_name)
+
+
+def ingest_evals(experiments_repo_path, dataset_name):
+ dataset_name_short = dataset_name.split("_")[1].lower()
+ models_dir = os.path.join(experiments_repo_path, "evaluation", dataset_name_short)
+ for model_name in os.listdir(models_dir):
+ ingest_eval(experiments_repo_path, dataset_name, model_name)
+
+
+def main():
+ parser = argparse.ArgumentParser(description="Ingest evaluation logs into Weave.")
+ parser.add_argument(
+ "--experiments_repo_path", help="Path to the experiments repository"
+ )
+ parser.add_argument(
+ "--dataset_name",
+ choices=["SWE-bench", "SWE-bench_Verified", "SWE-bench_Lite"],
+ default="SWE-bench_Verified",
+ help="Name of the dataset",
+ )
+ parser.add_argument("--model_name", help="Name of the model")
+
+ args = parser.parse_args()
+
+ if not args.experiments_repo_path or not os.path.exists(args.experiments_repo_path):
+ print(
+ f"Error: Experiments repository path does not exist: {args.experiments_repo_path}"
+ )
+ sys.exit(1)
+
+ # Initialize Weave
+ weave.init("weavedev-swebench5")
+
+ if args.model_name:
+ ingest_eval(args.experiments_repo_path, args.dataset_name, args.model_name)
+ else:
+ ingest_evals(args.experiments_repo_path, args.dataset_name)
+
+
+if __name__ == "__main__":
+ main()
+ # from rich import print
+
+ # print(
+ # load_instance_eval(
+ # "/Users/shawnlewis/code/experiments",
+ # "SWE-bench_Verified",
+ # "20240620_sweagent_claude3.5sonnet",
+ # "sympy__sympy-24661",
+ # )
+ # )
diff --git a/programmer/swebench/ingest/make_dataset.py b/programmer/swebench/ingest/make_dataset.py
new file mode 100644
index 0000000..a92fe03
--- /dev/null
+++ b/programmer/swebench/ingest/make_dataset.py
@@ -0,0 +1,76 @@
+import argparse
+import sys
+from typing import Optional
+import pandas as pd
+import weave
+
+
+splits = {
+ "dev": "data/dev-00000-of-00001.parquet",
+ "test": "data/test-00000-of-00001.parquet",
+ "train": "data/train-00000-of-00001.parquet",
+}
+
+
+def load_raw_dataset(name: str, split: str):
+ return pd.read_parquet(
+ f"hf://datasets/princeton-nlp/{name}/data/{split}-00000-of-00001.parquet"
+ )
+
+
+def load_weave_dataset(name: str, split: str, limit: Optional[int] = None):
+ df = load_raw_dataset(name, split)
+
+ data_list = df.to_dict("records")
+ data_list = data_list[:limit] if limit else data_list
+
+ return weave.Dataset(name=f"Verified-{limit}", rows=data_list) # type: ignore
+
+
+def main(dataset_name="SWE-bench_Verified", split="test"):
+ valid_datasets = ["SWE-bench", "SWE-bench_Verified", "SWE-bench_Lite"]
+ valid_splits = ["dev", "test", "train"]
+
+ if dataset_name not in valid_datasets:
+ print(f"Error: Invalid dataset name. Choose from {', '.join(valid_datasets)}")
+ sys.exit(1)
+
+ if split not in valid_splits:
+ print(f"Error: Invalid split. Choose from {', '.join(valid_splits)}")
+ sys.exit(1)
+
+ print(f"Creating dataset: {dataset_name}")
+ print(f"Split: {split}")
+
+ weave.init("weavedev-swebench1")
+
+ df = load_raw_dataset(dataset_name, split)
+
+ data_list = df.to_dict("records")
+
+ dataset = weave.Dataset(rows=data_list) # type: ignore
+
+ weave.publish(dataset, f"{dataset_name}_{split}")
+
+ print(f"Dataset '{dataset_name}_{split}' created and saved successfully.")
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(
+ description="Create a dataset with specified name and split."
+ )
+ parser.add_argument(
+ "--dataset_name",
+ choices=["SWE-bench", "SWE-bench_Verified", "SWE-bench_Lite"],
+ default="SWE-bench_Verified",
+ help="Name of the dataset to create",
+ )
+ parser.add_argument(
+ "--split",
+ choices=["dev", "test", "train"],
+ default="test",
+ help="Split of the dataset to create",
+ )
+
+ args = parser.parse_args()
+ main(args.dataset_name, args.split)
diff --git a/programmer/swebench/ingest/requirements.txt b/programmer/swebench/ingest/requirements.txt
new file mode 100644
index 0000000..ac11f68
--- /dev/null
+++ b/programmer/swebench/ingest/requirements.txt
@@ -0,0 +1,4 @@
+weave
+pandas
+fsspec
+huggingface_hub
\ No newline at end of file
diff --git a/programmer/swebench/run_instance.py b/programmer/swebench/run_instance.py
new file mode 100644
index 0000000..f8846d5
--- /dev/null
+++ b/programmer/swebench/run_instance.py
@@ -0,0 +1,52 @@
+import os
+import argparse
+import pandas as pd
+
+from rich import print
+
+import weave
+
+from ..weave_next.api import init_local_client
+from ..settings_manager import SettingsManager
+
+from ..swebench.swebench_model import SWEBenchProgrammerModel
+from ..swebench.score import score_swebench
+from ..config import agent_replace
+
+
+def main():
+ parser = argparse.ArgumentParser(description="Programmer")
+ parser.add_argument(
+ "--instance_id", type=str, help="The instance id to run", required=True
+ )
+
+ # Initialize settings
+ SettingsManager.initialize_settings()
+ logging_mode = SettingsManager.get_setting("weave_logging")
+ if logging_mode == "cloud":
+ curdir = os.path.basename(os.path.abspath(os.curdir))
+ weave.init(f"programmer-{curdir}")
+ elif logging_mode == "local":
+ init_local_client(os.path.join(SettingsManager.PROGRAMMER_DIR, "weave.db"))
+
+ args = parser.parse_args()
+
+ df = pd.read_parquet("programmer/swebench/data/swebench-verified.parquet")
+
+ instance_id = args.instance_id
+ instance = df[df["instance_id"] == instance_id].iloc[0]
+ problem_statement = instance["problem_statement"]
+
+ print("PROBLEM STATEMENT\n", problem_statement)
+ print()
+ print("SOLUTION\n", instance["patch"])
+ print()
+
+ model = SWEBenchProgrammerModel(agent=agent_replace)
+ model_output = model.predict(instance)
+ score = score_swebench(instance, model_output["answer"])
+ print("SCORE\n", score)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/programmer/swebench/score.py b/programmer/swebench/score.py
new file mode 100644
index 0000000..e9cd799
--- /dev/null
+++ b/programmer/swebench/score.py
@@ -0,0 +1,58 @@
+from typing import Any
+from swebench.harness.test_spec import make_test_spec
+from swebench.harness.log_parsers import MAP_REPO_TO_PARSER
+from swebench.harness.grading import get_eval_tests_report, get_resolution_status
+from swebench.harness.constants import (
+ FAIL_TO_PASS,
+ KEY_INSTANCE_ID,
+ PASS_TO_PASS,
+ ResolvedStatus,
+ SWEbenchInstance,
+)
+
+from ..tools import RemoteContainerToolContext
+
+
+def score_swebench(instance: SWEbenchInstance, model_output):
+ patch = model_output["answer"]
+ tc = RemoteContainerToolContext(
+ "http://localhost:8000",
+ "/testbed",
+ "source /opt/miniconda3/bin/activate && conda activate testbed && ",
+ )
+
+ result: dict[str, Any] = {"patch_successfully_applied": False, "resolved": False}
+
+ ts = make_test_spec(instance)
+ container_id = f"sweb.eval.x86_64.{ts.instance_id}"
+ with tc.context(container_id):
+ print("EVAL SCRIPT\n", ts.eval_script)
+
+ tc.write_file("/tmp/patch.diff", patch)
+ patch_result = tc.run_command("git apply -v /tmp/patch.diff")
+ if patch_result["exit_code"] == 0:
+ result["patch_successfully_applied"] = True
+ print("PATCH RESULT\n", patch_result)
+
+ tc.write_file("/eval.sh", ts.eval_script)
+ test_command_results = tc.run_command("chmod +x /eval.sh && /eval.sh")
+ tc_output = test_command_results["output"]
+
+ repo = "-".join(
+ ts.instance_id.replace("__", "/").split("-")[:-1]
+ ) # e.g. scikit-learn/scikit-learn
+ log_parser = MAP_REPO_TO_PARSER[repo]
+ test_name_to_passfail = log_parser(tc_output)
+
+ eval_ref = {
+ KEY_INSTANCE_ID: ts.instance_id,
+ FAIL_TO_PASS: ts.FAIL_TO_PASS,
+ PASS_TO_PASS: ts.PASS_TO_PASS,
+ }
+
+ report = get_eval_tests_report(test_name_to_passfail, eval_ref)
+ resolved = get_resolution_status(report) == ResolvedStatus.FULL.value
+
+ result.update({"resolved": resolved, "tests_status": report})
+
+ return result
diff --git a/programmer/swebench/scripts/example_v_models.py b/programmer/swebench/scripts/example_v_models.py
new file mode 100644
index 0000000..199b07d
--- /dev/null
+++ b/programmer/swebench/scripts/example_v_models.py
@@ -0,0 +1,56 @@
+# using existing swe-bench results logged to weave (see ingest dir),
+# produce a table with instance_id as rows, and models as columns.
+# useful for finding easy / hard examples
+
+import sys
+import pandas as pd
+
+import weave
+
+from ...weave_next.weave_query import calls
+
+
+def main():
+ if len(sys.argv) > 1:
+ wc = weave.init("weavedev-swebench5")
+ c = calls(wc, "Evaluation.predict_and_score", expand_refs=["inputs.example"])
+ df = c.to_pandas()
+
+ df.to_parquet("verified.parquet", engine="pyarrow")
+ else:
+ df = pd.read_parquet("verified.parquet")
+ # Pivot the dataframe
+ pivot_df = df.pivot(
+ index="inputs.example.instance_id",
+ columns="inputs.model",
+ values="output.model_output.resolved",
+ )
+
+ # Extract model names from the column names
+ pivot_df.columns = pivot_df.columns.str.extract(r"object/(.+):")[0]
+
+ # Count models with resolved True for each instance
+ pivot_df["models_resolved_true"] = pivot_df.apply(lambda row: row.sum(), axis=1)
+
+ # Move the count column to the leftmost position
+ cols = pivot_df.columns.tolist()
+ cols = cols[-1:] + cols[:-1]
+ pivot_df = pivot_df[cols]
+
+ # Sort the pivot table by 'models_resolved_true' in descending order
+ pivot_df = pivot_df.sort_values(by="models_resolved_true", ascending=False) # type: ignore
+
+ # Sort columns by the model that got the most resolved
+ model_success_count = pivot_df.sum().sort_values(ascending=False)
+ sorted_columns = ["models_resolved_true"] + model_success_count.index.tolist()
+ pivot_df = pivot_df[sorted_columns]
+
+ # Display the first few rows of the resulting table
+ print(pivot_df.head())
+
+ # Optionally, save the pivot table to a new file
+ pivot_df.to_csv("pivot_table.csv")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/programmer/swe-bench/swebench-difficulties.py b/programmer/swebench/scripts/verified_difficulty_labels.py
similarity index 97%
rename from programmer/swe-bench/swebench-difficulties.py
rename to programmer/swebench/scripts/verified_difficulty_labels.py
index 6a86e57..9c3068b 100644
--- a/programmer/swe-bench/swebench-difficulties.py
+++ b/programmer/swebench/scripts/verified_difficulty_labels.py
@@ -1,5 +1,6 @@
# Quick script for viewing swebench examples against
# annotated difficulties.
+# TODO: update for new file paths (in ../data)
import pandas as pd
import textwrap
diff --git a/programmer/swebench/swebench_model.py b/programmer/swebench/swebench_model.py
new file mode 100644
index 0000000..53805d6
--- /dev/null
+++ b/programmer/swebench/swebench_model.py
@@ -0,0 +1,41 @@
+import weave
+
+from ..agent import Agent, AgentState, TimeLimitExceeded
+from ..tools import RemoteContainerToolContext
+
+
+class SWEBenchProgrammerModel(weave.Model):
+ agent: Agent
+ max_runtime_seconds: int = 60
+
+ def predict(self, instance):
+ instance_id = instance["instance_id"]
+ problem_statement = instance["problem_statement"]
+ initial_prompt = f"""You are in a checkout of a git repo. Please identify and fix the issue described in the problem statement.
+
+
+{problem_statement}
+"""
+ state = AgentState(
+ history=[
+ {
+ "role": "user",
+ "content": initial_prompt,
+ },
+ ],
+ )
+
+ tc = RemoteContainerToolContext(
+ "http://localhost:8000",
+ "/testbed",
+ "source /opt/miniconda3/bin/activate && conda activate testbed && ",
+ )
+ container_id = f"sweb.eval.x86_64.{instance_id}"
+ with tc.context(container_id):
+ try:
+ self.agent.run(state, max_runtime_seconds=self.max_runtime_seconds)
+ except TimeLimitExceeded:
+ return {"errorcode": "runtime", "answer": ""}
+ answer_result = tc.run_command("git diff")
+ answer = answer_result["output"]
+ return {"answer": answer}
diff --git a/programmer/tests/test_file_line_tools.py b/programmer/tests/test_file_line_tools.py
index 16cec22..eec5f8e 100644
--- a/programmer/tests/test_file_line_tools.py
+++ b/programmer/tests/test_file_line_tools.py
@@ -1,22 +1,20 @@
import os
import pytest
from tempfile import TemporaryDirectory
-from programmer.tools import read_lines_from_file, replace_lines_in_file
+from programmer.tools import (
+    read_lines_from_file,
+    replace_lines_in_file,
+    LocalToolContext,
+    tool_context,
+    get_current_context,
+)
@pytest.fixture()
-def temp_dir():
+def tempdir_tool_context():
with TemporaryDirectory() as tmpdir:
- yield tmpdir
+ with tool_context(LocalToolContext(tmpdir)) as tc:
+ yield tc
@pytest.fixture()
-def test_file_path(temp_dir):
- file_path = os.path.join(temp_dir, "test_file.txt")
- with open(file_path, "w") as f:
- f.write(
- "Line 1\nLine 2\nLine 3\nLine 4\nLine 5\nLine 6\nLine 7\nLine 8\nLine 9\nLine 10\n"
- )
+def test_file_path(tempdir_tool_context):
+ file_path = "test_file.txt"
+ tempdir_tool_context.write_file(file_path, "Line 1\nLine 2\nLine 3\nLine 4\nLine 5\nLine 6\nLine 7\nLine 8\nLine 9\nLine 10\n")
yield file_path
@@ -39,7 +37,7 @@ def test_read_lines_from_file(test_file_path):
read_lines_from_file(test_file_path, 11)
-def test_replace_lines_in_file(temp_dir, test_file_path):
+def test_replace_lines_in_file(test_file_path):
# Valid replacement
result = replace_lines_in_file(
test_file_path,
@@ -54,8 +52,7 @@ def test_replace_lines_in_file(temp_dir, test_file_path):
assert "10:Line 10\n" in result
# Replacement with a new file
- new_file_path = os.path.join(temp_dir, "new_test_file.txt")
- result = replace_lines_in_file(new_file_path, 1, 0, "", "First Line\nSecond Line\n")
+ result = replace_lines_in_file("new_test_file.txt", 1, 0, "", "First Line\nSecond Line\n")
assert "1:First Line\n" in result
assert "2:Second Line\n" in result
@@ -63,18 +60,16 @@ def test_replace_lines_in_file(temp_dir, test_file_path):
# Test appending to the end of a file
-def test_append_to_file(temp_dir, test_file_path):
+def test_append_to_file(tempdir_tool_context, test_file_path):
# Read the original content
- with open(test_file_path, "r") as f:
- original_content = f.read()
+ original_content = tempdir_tool_context.read_file(test_file_path)
# Append new lines
new_lines = "New Line 11\nNew Line 12\n"
result = replace_lines_in_file(test_file_path, 11, 0, "", new_lines)
# Verify the file content
- with open(test_file_path, "r") as f:
- updated_content = f.read()
+ updated_content = tempdir_tool_context.read_file(test_file_path)
assert updated_content == original_content + new_lines
@@ -90,10 +85,9 @@ def test_append_to_file(temp_dir, test_file_path):
# Test inserting at the beginning of an existing file
-def test_insert_at_beginning(test_file_path):
+def test_insert_at_beginning(tempdir_tool_context, test_file_path):
# Read the original content
- with open(test_file_path, "r") as f:
- original_content = f.read()
+ original_content = tempdir_tool_context.read_file(test_file_path)
# Insert new lines at the beginning
new_lines = "New First Line\nNew Second Line\n"
@@ -105,8 +99,7 @@ def test_insert_at_beginning(test_file_path):
assert "3:Line 1\n" in result
# Verify the file content
- with open(test_file_path, "r") as f:
- updated_content = f.read()
+ updated_content = tempdir_tool_context.read_file(test_file_path)
assert updated_content == new_lines + original_content
diff --git a/programmer/tools.py b/programmer/tools.py
index 8c685c6..c5e7766 100644
--- a/programmer/tools.py
+++ b/programmer/tools.py
@@ -4,28 +4,160 @@
import subprocess
import weave
import contextlib
+import shlex
from contextvars import ContextVar
+from contextlib import contextmanager
+from typing import Protocol, Union, TypedDict, Optional
+import requests
LENGTH_LIMIT = 30000
+# TODO:
+# - get rid of resolve_path
+# - must return FileNotFoundError in read_file in Remote
-class ToolContext:
+
+class RunCommandResult(TypedDict):
+ exit_code: int
+ output: str
+
+
+class ToolContext(Protocol):
+ def write_file(self, path: str, content: str) -> None: ...
+
+ def read_file(self, path: str) -> str: ...
+
+ def run_command(self, command: str) -> RunCommandResult: ...
+
+ def resolve_path(self, path: str) -> str: ...
+
+
+class LocalToolContext(ToolContext):
def __init__(self, directory):
self.directory = os.path.abspath(directory)
- def resolve_path(self, path):
+ def write_file(self, path: str, content: str) -> None:
+ full_path = self.resolve_path(path)
+ with open(full_path, "w") as f:
+ f.write(content)
+
+ def read_file(self, path: str) -> str:
+ full_path = self.resolve_path(path)
+ with open(full_path, "r") as f:
+ return f.read()
+
+ def run_command(self, command: str) -> RunCommandResult:
+ completed_process = subprocess.run(
+ command,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.STDOUT,
+ text=True,
+ shell=True,
+ cwd=self.directory,
+ )
+ exit_code = completed_process.returncode
+ output = completed_process.stdout.strip()
+
+ return {
+ "exit_code": exit_code,
+ "output": output,
+ }
+
+ def resolve_path(self, path: str) -> str:
return os.path.join(self.directory, path)
+class RemoteContainerToolContext(ToolContext):
+ def __init__(self, base_url: str, directory: str, command_prefix: str):
+ self.base_url = base_url
+ self.container_id = None
+ self.directory = directory
+ self.command_prefix = command_prefix
+
+ @contextmanager
+ def context(self, image_id: str):
+ self.start_container(image_id)
+ try:
+ with tool_context(self):
+ yield
+ finally:
+ self.stop_container()
+
+ def start_container(self, image_id):
+ response = requests.post(
+ f"{self.base_url}/container/start", json={"image_id": image_id}
+ )
+ if response.status_code == 200:
+ self.container_id = response.json().get("container_id")
+ else:
+ print(f"Failed to start container: {response.text}")
+
+ def stop_container(self):
+ response = requests.post(
+ f"{self.base_url}/container/stop",
+ json={"container_id": self.container_id, "delete": True},
+ )
+ if response.status_code == 200:
+ self.container_id = None
+ else:
+ print(f"Failed to stop container: {response.text}")
+
+ def write_file(self, path: str, content: str) -> None:
+ full_path = os.path.join(self.directory, path)
+ response = requests.post(
+ f"{self.base_url}/container/write_file",
+ json={
+ "container_id": self.container_id,
+ "file_path": full_path,
+ "file_content": content,
+ },
+ )
+ if response.status_code != 200:
+ raise Exception(f"Failed to write file: {response.text}")
+
+ def read_file(self, path: str) -> str:
+ full_path = os.path.join(self.directory, path)
+ response = requests.post(
+ f"{self.base_url}/container/read_file",
+ json={"container_id": self.container_id, "file_path": full_path},
+ )
+ if response.status_code == 200:
+ return response.json().get("file_content")
+ else:
+ raise Exception(f"Failed to read file: {response.text}")
+
+ def run_command(self, command: str) -> RunCommandResult:
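+ # Prepend the context's command prefix (e.g. conda activation) and quote
+ # the result so it runs as a single bash -c invocation in the container.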
+ command = self.command_prefix + command
+ command = f"bash -c {shlex.quote(command)}"
+ response = requests.post(
+ f"{self.base_url}/container/run",
+ json={
+ "container_id": self.container_id,
+ "workdir": self.directory,
+ "command": command,
+ },
+ )
+ if response.status_code == 200:
+ json = response.json()
+ return {
+ "exit_code": json["exit_code"],
+ "output": json["output"],
+ }
+ else:
+ raise Exception(f"Failed to run command: {response.text}")
+
+ def resolve_path(self, path: str) -> str:
+ return path # For remote containers, we assume paths are already resolved
+
+
# Create a ContextVar to store the current ToolContext
-current_context: ContextVar[ToolContext | None] = ContextVar(
- "current_context", default=None
-)
+current_context: ContextVar[
+ Optional[Union[LocalToolContext, RemoteContainerToolContext]]
+] = ContextVar("current_context", default=None)
@contextlib.contextmanager
-def tool_context(directory):
- context = ToolContext(directory)
+def tool_context(context: Union[LocalToolContext, RemoteContainerToolContext]):
token = current_context.set(context)
try:
yield context
@@ -33,10 +165,10 @@ def tool_context(directory):
current_context.reset(token)
-def get_current_context():
+def get_current_context() -> Union[LocalToolContext, RemoteContainerToolContext]:
context = current_context.get()
if context is None:
- return ToolContext(".")
+ return LocalToolContext(".")
return context
@@ -95,12 +227,16 @@ def list_files(directory: str) -> str:
The list of files in the directory.
"""
context = get_current_context()
- full_path = context.resolve_path(directory)
- result = json.dumps(os.listdir(full_path))
- if len(result) > LENGTH_LIMIT:
- result = result[:LENGTH_LIMIT]
- result += "\n... (truncated)"
- return result
+ # Use run_command so listing works for both local and remote contexts.
+ result = context.run_command(f"ls {directory}")
+ exit_code = result["exit_code"]
+ output = result["output"]
+ if exit_code != 0:
+ raise Exception(f"Failed to list files: {output}")
+ if len(output) > LENGTH_LIMIT:
+ output = output[:LENGTH_LIMIT]
+ output += "\n... (truncated)"
+ return output
@weave.op()
@@ -115,13 +251,14 @@ def write_to_file(path: str, content: str) -> str:
A message indicating whether the file was written successfully.
"""
context = get_current_context()
- full_path = context.resolve_path(path)
- with open(full_path, "w") as f:
- f.write(content)
+ if len(content) > LENGTH_LIMIT:
+ content = content[:LENGTH_LIMIT]
+ content += "\n... (truncated)"
+ context.write_file(path, content)
return "File written successfully."
-@weave.op()
+@weave.op
def read_from_file(path: str) -> str:
"""Read text from a file at the given path.
@@ -132,13 +269,11 @@ def read_from_file(path: str) -> str:
The content of the file.
"""
context = get_current_context()
- full_path = context.resolve_path(path)
- with open(full_path, "r") as f:
- result = f.read()
- if len(result) > LENGTH_LIMIT:
- result = result[:LENGTH_LIMIT]
- result += "\n... (truncated)"
- return result
+ result = context.read_file(path)
+ if len(result) > LENGTH_LIMIT:
+ result = result[:LENGTH_LIMIT]
+ result += "\n... (truncated)"
+ return result
@weave.op()
@@ -152,30 +287,18 @@ def run_command(command: str) -> str:
The output of the command.
"""
context = get_current_context()
- completed_process = subprocess.run(
- command,
- stdout=subprocess.PIPE,
- stderr=subprocess.PIPE,
- text=True,
- shell=True,
- cwd=context.directory, # Set the working directory for the command
- )
- exit_code = completed_process.returncode
- stdout = completed_process.stdout.strip()
- stderr = completed_process.stderr.strip()
-
- if len(stdout) > LENGTH_LIMIT:
- stdout = stdout[:LENGTH_LIMIT]
- stdout += "\n... (truncated)"
- if len(stderr) > LENGTH_LIMIT:
- stderr = stderr[:LENGTH_LIMIT]
- stderr += "\n... (truncated)"
+ result = context.run_command(command)
+
+ exit_code = result["exit_code"]
+ output = result["output"]
+
+ if len(output) > LENGTH_LIMIT:
+ output = output[:LENGTH_LIMIT]
+ output += "\n... (truncated)"
result = f"Exit code: {exit_code}\n"
- if stderr:
- result += f"STDERR\n{stderr}\n"
- if stdout:
- result += f"STDOUT\n{stdout}\n"
+ if output:
+ result += f"OUTPUT\n{output}\n"
return result
@@ -195,11 +318,8 @@ def read_lines_from_file(file_path: str, start_line: int) -> str:
"""
context = get_current_context()
full_path = context.resolve_path(file_path)
- if not os.path.exists(full_path):
- raise Exception(f"File '{full_path}' does not exist.")
-
- with open(full_path, "r") as file:
- lines = file.readlines()
+ content = context.read_file(full_path)
+ lines = content.splitlines()
if start_line < 1 or start_line > len(lines):
raise Exception("Invalid start_line number.")
@@ -208,7 +328,7 @@ def read_lines_from_file(file_path: str, start_line: int) -> str:
result = ""
for i in range(start_line - 1, end_line - 1):
- result += f"{i + 1}:{lines[i]}"
+ result += f"{i + 1}:{lines[i]}\n"
return result
@@ -238,44 +358,39 @@ def replace_lines_in_file(
"""
context = get_current_context()
full_path = context.resolve_path(file_path)
- lines = []
- if os.path.exists(full_path):
- with open(full_path, "r") as file:
- lines = file.readlines()
+ try:
+ content = context.read_file(full_path)
+ except FileNotFoundError:
+ content = ""
+ lines = content.splitlines()
end_line = start_line + remove_line_count
if start_line < 1 or end_line < start_line or start_line > len(lines) + 1:
raise Exception("Invalid line range.")
- prev_line_split = [l + "\n" for l in previous_lines.splitlines()]
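+ # splitlines() drops trailing newlines: lines are compared and edited
+ # newline-free, then re-joined with "\n" when written back.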
+ prev_line_split = previous_lines.splitlines()
if not lines[start_line - 1 : end_line - 1] == prev_line_split:
raise Exception("Previous lines do not match.")
# Adjust end_line if it exceeds the current number of lines
end_line = min(end_line, len(lines) + 1)
- if not new_lines.endswith("\n"):
- new_lines += "\n"
-
# Convert new_lines string into a list of lines
- new_lines_list = new_lines.splitlines(keepends=True)
+ new_lines_list = new_lines.splitlines()
# Replace the specified line range
lines[start_line - 1 : end_line - 1] = new_lines_list
# Write the modified lines back to the file
- with open(full_path, "w") as file:
- file.writelines(lines)
+ context.write_file(full_path, "\n".join(lines) + "\n")
# Determine the range for the output with a 5-line buffer
output_start = max(start_line - 6, 0)
- output_end = min(
- start_line - 1 + len(new_lines_list) + 6, len(lines)
- ) # Calculate buffer correctly
+ output_end = min(start_line - 1 + len(new_lines_list) + 6, len(lines))
result = ""
for i in range(output_start, output_end):
- result += f"{i + 1}:{lines[i]}"
+ result += f"{i + 1}:{lines[i]}\n"
return result
diff --git a/pyproject.toml b/pyproject.toml
index ddb29a0..1fe20b5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -14,7 +14,7 @@ license = { text = "Apache-2.0" }
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
- "weave", "streamlit", "pandas", "litellm"
+ "weave==0.50.15", "streamlit", "pandas", "litellm"
]
[tool.setuptools]
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 55b033e..ab309e2 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1 +1,7 @@
-pytest
\ No newline at end of file
+setuptools
+wheel
+pytest
+pyright
+fastapi
+docker
+swebench
\ No newline at end of file