diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 013d21d..ca60206 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -26,7 +26,7 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install setuptools wheel pytest pyright
+          pip install -r requirements-dev.txt
           pip install -e .
       - name: Run tests
diff --git a/programmer/agent.py b/programmer/agent.py
index 603fa81..bdb40fc 100644
--- a/programmer/agent.py
+++ b/programmer/agent.py
@@ -1,6 +1,7 @@
 from typing import Any, Optional, Union
 from pydantic import Field
 import litellm
+import time
 from openai.types.chat import (
     ChatCompletionMessageParam,
 )
@@ -14,6 +15,10 @@
 from .environment import get_current_environment, EnvironmentSnapshotKey
 
 
+class TimeLimitExceeded(Exception):
+    pass
+
+
 def get_commit_message(history: list[Any]) -> str:
     # Commit message is the most recent message with 'content'
     for i in range(len(history) - 1, -1, -1):
@@ -66,9 +71,10 @@ def step(self, state: AgentState) -> AgentState:
             The new state of the environment.
         """
         Console.step_start("agent", "green")
-        ref = weave.obj_ref(state)
-        if ref:
-            print("state ref:", ref.uri())
+        # Printing this is ugly
+        # ref = weave.obj_ref(state)
+        # if ref:
+        #     print("state ref:", ref.uri())
 
         messages: list[ChatCompletionMessageParam] = [
             {"role": "system", "content": self.system_message},
@@ -124,9 +130,17 @@ def step(self, state: AgentState) -> AgentState:
         return AgentState(history=new_history, env_snapshot_key=snapshot_key)
 
     @weave.op()
-    def run(self, state: AgentState):
+    def run(self, state: AgentState, max_runtime_seconds: int = -1):
+        start_time = time.time()
         while True:
             last_message = state.history[-1]
             if last_message["role"] == "assistant" and "tool_calls" not in last_message:
                 return state
             state = self.step(state)
+            if (
+                max_runtime_seconds > 0
+                and time.time() - start_time > max_runtime_seconds
+            ):
+                raise TimeLimitExceeded(
+                    f"Agent runtime exceeded {max_runtime_seconds}s"
+                )
diff --git a/programmer/containerserver/README.md b/programmer/containerserver/README.md
new file mode 100644
index 0000000..3fc4307
--- /dev/null
+++ b/programmer/containerserver/README.md
@@ -0,0 +1,32 @@
+# Container Manager Server
+
+## Build images on server
+
+We use this for running swe-bench locally against containers on a remote server. See the [swebench README](../swebench/README.md) for steps to build the SWE-bench images.
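+
+Once the server is running and the tunnel is up (see the next section), the
+endpoints can be exercised directly. Here is a minimal sketch using `requests`;
+the image id is just an example of a prebuilt SWE-bench instance image
+(`checkserver.py` below exercises the same endpoints in parallel):
+
+```
+import requests
+
+BASE_URL = "http://127.0.0.1:8000"
+
+# Start a container from a prebuilt SWE-bench instance image
+resp = requests.post(
+    f"{BASE_URL}/container/start",
+    json={"image_id": "sweb.eval.x86_64.sympy__sympy-20590"},
+)
+container_id = resp.json()["container_id"]
+
+# Run a command inside the container
+resp = requests.post(
+    f"{BASE_URL}/container/run",
+    json={"container_id": container_id, "workdir": "/", "command": "ls"},
+)
+print(resp.json()["output"])
+
+# Stop and delete the container
+requests.post(
+    f"{BASE_URL}/container/stop",
+    json={"container_id": container_id, "delete": True},
+)
+```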
+
+## Run and check server
+
+put cmserver.py on the remote machine
+```
+gcloud compute scp --zone "us-west1-a" --project "weave-support-367421" cmserver.py programmer-benchmark2:~/
+```
+
+on the remote machine
+
+(just 1 worker for now, there's global state)
+```
+uvicorn cmserver:app --host 0.0.0.0 --port 8000 --workers 1
+```
+
+tunnel from the local machine to the remote
+```
+gcloud compute ssh --zone "us-west1-a" "programmer-benchmark" --project "weave-support-367421" -- -NL 8000:localhost:8000
+```
+
+on the local machine
+```
+python checkserver.py
+```
+
+when it finishes, there should be no running containers left on the remote machine
+
diff --git a/programmer/containerserver/checkserver.py b/programmer/containerserver/checkserver.py
new file mode 100644
index 0000000..e214180
--- /dev/null
+++ b/programmer/containerserver/checkserver.py
@@ -0,0 +1,129 @@
+import requests
+import threading
+import argparse
+
+# Replace with the actual host and port if different
+BASE_URL = "http://127.0.0.1:8000"
+
+
+def start_container(image_id: str):
+    response = requests.post(f"{BASE_URL}/container/start", json={"image_id": image_id})
+    if response.status_code == 200:
+        return response.json().get("container_id")
+    else:
+        print(f"Failed to start container: {response.text}")
+        return None
+
+
+def run_command(container_id: str, workdir: str, command: str):
+    response = requests.post(
+        f"{BASE_URL}/container/run",
+        json={"container_id": container_id, "workdir": workdir, "command": command},
+    )
+    if response.status_code == 200:
+        return response.json()
+    else:
+        print(f"Failed to run command: {response.text}")
+        return None
+
+
+def write_file(container_id: str, file_path: str, file_content: str):
+    response = requests.post(
+        f"{BASE_URL}/container/write_file",
+        json={
+            "container_id": container_id,
+            "file_path": file_path,
+            "file_content": file_content,
+        },
+    )
+    if response.status_code == 200:
+        return response.json().get("status")
+    else:
+        print(f"Failed to write file: {response.text}")
+        return None
+
+
+def read_file(container_id: str, file_path: str):
+    response = requests.post(
+        f"{BASE_URL}/container/read_file",
+        json={"container_id": container_id, "file_path": file_path},
+    )
+    if response.status_code == 200:
+        return response.json().get("file_content")
+    else:
+        print(f"Failed to read file: {response.text}")
+        return None
+
+
+def stop_container(container_id: str, delete: bool):
+    response = requests.post(
+        f"{BASE_URL}/container/stop",
+        json={"container_id": container_id, "delete": delete},
+    )
+    if response.status_code == 200:
+        return response.json().get("status")
+    else:
+        print(f"Failed to stop container: {response.text}")
+        return None
+
+
+def manage_container(image_id: str, container_index: int):
+    print(f"Starting container {container_index}...")
+    container_id = start_container(image_id)
+    if not container_id:
+        print(f"Failed to start container {container_index}")
+        return
+
+    print(f"Started container {container_index} with ID: {container_id}")
+
+    # Run a command inside the container
+    output = run_command(container_id, "/", "ls")
+    if output:
+        print(f"Container {container_index} command output:\n{output}")
+
+    # Write a file inside the container
+    file_path = f"test_{container_index}.txt"
+    file_content = f"Hello, this is a test for container {container_index}."
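+    # write_file POSTs to /container/write_file; the server tars the
+    # content and copies it into the container via put_archive (see cmserver.py)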
+ write_status = write_file(container_id, file_path, file_content) + if write_status: + print(f"Container {container_index} write file status: {write_status}") + + # Read the file back from the container + read_content = read_file(container_id, file_path) + if read_content: + print(f"Container {container_index} file content:\n{read_content}") + + # Stop the container (and delete it) + stop_status = stop_container(container_id, delete=True) + if stop_status: + print(f"Container {container_index} stop status: {stop_status}") + + +def run_parallel_tests(image_id: str, parallelism: int): + threads = [] + for i in range(parallelism): + thread = threading.Thread(target=manage_container, args=(image_id, i)) + threads.append(thread) + thread.start() + + for thread in threads: + thread.join() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Run parallel container tests") + parser.add_argument( + "--parallelism", + type=int, + default=1, + help="Number of parallel container operations (default: 1)", + ) + parser.add_argument( + "--image-id", + type=str, + default="sweb.eval.x86_64.sympy__sympy-20590", + help="Image ID to test", + ) + args = parser.parse_args() + + run_parallel_tests(args.image_id, args.parallelism) diff --git a/programmer/containerserver/cmserver.py b/programmer/containerserver/cmserver.py new file mode 100644 index 0000000..2d8dd3e --- /dev/null +++ b/programmer/containerserver/cmserver.py @@ -0,0 +1,178 @@ +import os +import tarfile +from io import BytesIO +from concurrent.futures import ThreadPoolExecutor +import asyncio +import docker +from docker.errors import NotFound +from fastapi import FastAPI, HTTPException +from pydantic import BaseModel + + +# DockerContainerManager class +class DockerContainerManager: + def __init__(self): + self.client = docker.from_env() + self.executor = ThreadPoolExecutor() + + async def start_container(self, image_id: str): + loop = asyncio.get_event_loop() + container = await loop.run_in_executor( + self.executor, self._run_container, image_id + ) + return container.short_id + + def _run_container(self, image_id: str): + return self.client.containers.run( + image_id, detach=True, command="tail -f /dev/null" + ) + + def _get_container(self, container_id: str): + return self.client.containers.get(container_id) + + async def run_command(self, container_id: str, workdir: str, command: str): + loop = asyncio.get_event_loop() + exec_result = await loop.run_in_executor( + self.executor, self._exec_run, container_id, command, workdir + ) + return { + "exit_code": exec_result.exit_code, + "output": exec_result.output.decode("utf-8"), + } + + def _exec_run(self, container_id: str, command: str, workdir: str): + container = self._get_container(container_id) + return container.exec_run(command, workdir=workdir) + + async def write_file(self, container_id: str, file_path: str, file_content: str): + file_path = os.path.join("/", file_path) + container = self._get_container(container_id) + tarstream = BytesIO() + with tarfile.open(fileobj=tarstream, mode="w") as tar: + tarinfo = tarfile.TarInfo(name=os.path.basename(file_path)) + tarinfo.size = len(file_content) + tar.addfile(tarinfo, BytesIO(file_content.encode("utf-8"))) + tarstream.seek(0) + + loop = asyncio.get_event_loop() + await loop.run_in_executor( + self.executor, + container.put_archive, + os.path.dirname(file_path), + tarstream, + ) + + async def read_file(self, container_id: str, file_path: str): + container = self._get_container(container_id) + loop = 
asyncio.get_event_loop()
+        bits, _ = await loop.run_in_executor(
+            self.executor, container.get_archive, file_path
+        )
+        file_content = BytesIO()
+        for chunk in bits:
+            file_content.write(chunk)
+        file_content.seek(0)
+        with tarfile.open(fileobj=file_content) as tar:
+            member = tar.getmembers()[0]
+            extract_result = tar.extractfile(member)
+            if extract_result is None:
+                raise Exception(f"Unexpected tar.extractfile result for: {file_path}")
+            file_data = extract_result.read()
+        return file_data.decode("utf-8")
+
+    async def stop_container(self, container_id: str, delete: bool = False):
+        container = self._get_container(container_id)
+        loop = asyncio.get_event_loop()
+        await loop.run_in_executor(self.executor, container.stop)
+        if delete:
+            await loop.run_in_executor(self.executor, container.remove)
+
+
+# FastAPI setup
+app = FastAPI()
+container_manager = DockerContainerManager()
+
+
+class StartContainerRequest(BaseModel):
+    image_id: str
+
+
+class StopContainerRequest(BaseModel):
+    container_id: str
+    delete: bool
+
+
+class CommandRequest(BaseModel):
+    container_id: str
+    workdir: str
+    command: str
+
+
+class FileRequest(BaseModel):
+    container_id: str
+    file_path: str
+    file_content: str
+
+
+class FilePathRequest(BaseModel):
+    container_id: str
+    file_path: str
+
+
+@app.post("/container/start")
+async def start_container(request: StartContainerRequest):
+    try:
+        container_id = await container_manager.start_container(request.image_id)
+        return {"container_id": container_id}
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@app.post("/container/run")
+async def run_command(request: CommandRequest):
+    try:
+        result = await container_manager.run_command(
+            request.container_id, request.workdir, request.command
+        )
+        return {"exit_code": result["exit_code"], "output": result["output"]}
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@app.post("/container/write_file")
+async def write_file(request: FileRequest):
+    try:
+        await container_manager.write_file(
+            request.container_id, request.file_path, request.file_content
+        )
+        return {"status": "file written"}
+    except NotFound as e:
+        raise HTTPException(status_code=404, detail=str(e))
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@app.post("/container/read_file")
+async def read_file(request: FilePathRequest):
+    try:
+        file_content = await container_manager.read_file(
+            request.container_id, request.file_path
+        )
+        return {"file_content": file_content}
+    except NotFound as e:
+        raise HTTPException(status_code=404, detail=str(e))
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@app.post("/container/stop")
+async def stop_container(request: StopContainerRequest):
+    try:
+        await container_manager.stop_container(request.container_id, request.delete)
+        return {"status": "container stopped"}
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+# To run the server (single worker for now; see the README — there's global state):
+# uvicorn cmserver:app --host 0.0.0.0 --port 8000 --workers 1
diff --git a/programmer/evals/eval_repeated_edits.py b/programmer/evals/eval_repeated_edits.py
index 3443a4c..9d63819 100644
--- a/programmer/evals/eval_repeated_edits.py
+++ b/programmer/evals/eval_repeated_edits.py
@@ -10,14 +10,14 @@
 from ..agent import AgentState, Agent
 from ..config import agent, agent_claude, agent_claude_replace, agent_replace
-from ..tools import tool_context
+from ..tools import tool_context, LocalToolContext
 
 # 
@pytest.fixture @contextmanager def tempdir(): with tempfile.TemporaryDirectory() as dir_: - with tool_context(dir_) as tc: + with tool_context(LocalToolContext(dir_)) as tc: yield tc diff --git a/programmer/programmer.py b/programmer/programmer.py index 739db86..78f9b1a 100644 --- a/programmer/programmer.py +++ b/programmer/programmer.py @@ -10,9 +10,9 @@ import weave -from .agent import AgentState, get_commit_message +from .agent import Agent, AgentState, get_commit_message from .console import Console -from .config import agent +from .config import agent_replace from .environment import ( environment_session, restore_environment, @@ -34,9 +34,10 @@ def get_user_input(): @weave.op def user_input_step(state: AgentState) -> AgentState: Console.step_start("user_input", "purple") - ref = weave.obj_ref(state) - if ref: - print("state ref:", ref.uri()) + # Printing this is ugly + # ref = weave.obj_ref(state) + # if ref: + # print("state ref:", ref.uri()) user_input = get_user_input() environment = get_current_environment() history = state.history + [ @@ -63,7 +64,7 @@ def make_environment(): @weave.op -def session(agent_state: AgentState): +def session(agent: Agent, agent_state: AgentState): call = weave.get_current_call() session_id = None @@ -160,7 +161,7 @@ def main(): ], ) - session(state) + session(agent_replace, state) if __name__ == "__main__": diff --git a/programmer/swe-bench/code.patch b/programmer/swe-bench/code.patch deleted file mode 100644 index 9d29e02..0000000 --- a/programmer/swe-bench/code.patch +++ /dev/null @@ -1,29 +0,0 @@ -diff --git a/sympy/core/numbers.py b/sympy/core/numbers.py ---- a/sympy/core/numbers.py -+++ b/sympy/core/numbers.py -@@ -1624,10 +1624,11 @@ def __new__(cls, p, q=None, gcd=None): - - q = 1 - gcd = 1 -+ Q = 1 - - if not isinstance(p, SYMPY_INTS): - p = Rational(p) -- q *= p.q -+ Q *= p.q - p = p.p - else: - p = int(p) -@@ -1635,9 +1636,10 @@ def __new__(cls, p, q=None, gcd=None): - if not isinstance(q, SYMPY_INTS): - q = Rational(q) - p *= q.q -- q = q.p -+ Q *= q.p - else: -- q = int(q) -+ Q *= int(q) -+ q = Q - - # p and q are now ints - if q == 0: diff --git a/programmer/swe-bench/problem.txt b/programmer/swe-bench/problem.txt deleted file mode 100644 index ecc84ee..0000000 --- a/programmer/swe-bench/problem.txt +++ /dev/null @@ -1,7 +0,0 @@ - - -Rational calc value error -python 3.11, sympy 1.11.1 -when calc Rational('0.5', '100'), the value is 1/100100; but Rational(0.5, 100) the value is 1/200, this value is the true value, and the version of sympy 1.8 is normal - - diff --git a/programmer/swe-bench/test_code.patch b/programmer/swe-bench/test_code.patch deleted file mode 100644 index 6fa0c19..0000000 --- a/programmer/swe-bench/test_code.patch +++ /dev/null @@ -1,17 +0,0 @@ -diff --git a/sympy/core/tests/test_numbers.py b/sympy/core/tests/test_numbers.py ---- a/sympy/core/tests/test_numbers.py -+++ b/sympy/core/tests/test_numbers.py -@@ -366,6 +366,13 @@ def test_Rational_new(): - assert n.q == 4 - assert n.p == -2 - -+def test_issue_24543(): -+ for p in ('1.5', 1.5, 2): -+ for q in ('1.5', 1.5, 2): -+ assert Rational(p, q).as_numer_denom() == Rational('%s/%s'%(p,q)).as_numer_denom() -+ -+ assert Rational('0.5', '100') == Rational(1, 200) -+ - - def test_Number_new(): - """" diff --git a/programmer/swebench/README.md b/programmer/swebench/README.md new file mode 100644 index 0000000..582e5a7 --- /dev/null +++ b/programmer/swebench/README.md @@ -0,0 +1,63 @@ +# SWE Bench programmer evaluation + +## Build SWE-bench images + +First do setup (below) 
then run this command to build all the images. --cache_level instance tells the script not to delete the instance images, which are what we want to use with container-manager. + +``` +python -m swebench.harness.run_evaluation \ + --predictions_path gold \ + --max_workers 24 \ + --run_id validate-gold \ + --dataset_name princeton-nlp/SWE-bench_Verified \ + --cache_level instance +``` + + +## remote machine setup instructions on gcp VM ubuntu 20.04 + +``` + +sudo snap install docker +sudo groupadd docker +sudo usermod -aG docker $USER +sudo chown root:docker /var/run/docker.sock +sudo chmod 660 /var/run/docker.sock + +sudo apt update +sudo apt install -y \ + build-essential \ + libbz2-dev \ + libreadline-dev \ + libssl-dev \ + zlib1g-dev \ + libsqlite3-dev \ + libffi-dev \ + libncursesw5-dev \ + libgdbm-dev \ + liblzma-dev \ + tk-dev \ + libdb-dev \ + libexpat1-dev \ + libmpdec-dev \ + libxml2-dev \ + libxmlsec1-dev \ + libffi-dev \ + liblzma-dev + +# pyenv +curl https://pyenv.run | bash +echo 'export PYENV_ROOT="$HOME/.pyenv" +[[ -d $PYENV_ROOT/bin ]] && export PATH="$PYENV_ROOT/bin:$PATH" +eval "$(pyenv init -)" +eval "$(pyenv virtualenv-init -)"' >> ~/.bashrc +## exit and re-log in + +pyenv install 3.10.12 +pyenv virtualenv 3.10.12 swe-bench + +git clone https://github.com/princeton-nlp/SWE-bench.git +cd SWE-bench +pyenv local swe-bench +pip install -e . +``` \ No newline at end of file diff --git a/programmer/swebench/__init__.py b/programmer/swebench/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/programmer/swe-bench/ensembled_annotations_public.csv b/programmer/swebench/data/ensembled_annotations_public.csv similarity index 100% rename from programmer/swe-bench/ensembled_annotations_public.csv rename to programmer/swebench/data/ensembled_annotations_public.csv diff --git a/programmer/swe-bench/samples_with_3_annotations_public.csv b/programmer/swebench/data/samples_with_3_annotations_public.csv similarity index 100% rename from programmer/swe-bench/samples_with_3_annotations_public.csv rename to programmer/swebench/data/samples_with_3_annotations_public.csv diff --git a/programmer/swe-bench/swebench-verified.parquet b/programmer/swebench/data/swebench-verified.parquet similarity index 100% rename from programmer/swe-bench/swebench-verified.parquet rename to programmer/swebench/data/swebench-verified.parquet diff --git a/programmer/swebench/evaluate.py b/programmer/swebench/evaluate.py new file mode 100644 index 0000000..38a5725 --- /dev/null +++ b/programmer/swebench/evaluate.py @@ -0,0 +1,88 @@ +import asyncio +import pandas as pd +from typing import Optional +import random +import weave + +from .swebench_model import SWEBenchProgrammerModel +from .score import score_swebench +from ..agent import Agent +from ..config import SYSTEM_MESSAGE +from ..tools import ( + list_files, + run_command, + view_image, + read_lines_from_file, + replace_lines_in_file, +) + + +def load_raw_dataset(name: str, split: str): + return pd.read_parquet( + f"hf://datasets/princeton-nlp/{name}/data/{split}-00000-of-00001.parquet" + ) + + +def load_weave_dataset( + name: str, + split: str, + limit: Optional[int] = None, + instance_ids: Optional[list[str]] = None, + shuffle_seed: Optional[int] = None, +): + df = load_raw_dataset(name, split) + + data_list = df.to_dict("records") + if shuffle_seed is not None: + random.seed(shuffle_seed) + random.shuffle(data_list) + data_list = [ + r for r in data_list if instance_ids is None or r["instance_id"] in instance_ids + ] + data_list = 
data_list[:limit] if limit else data_list + data_list = [{"instance": r} for r in data_list] + + return weave.Dataset(name=f"Verified-{limit}-{shuffle_seed}", rows=data_list) # type: ignore + + +def main(): + weave.init("weavedev-programmereval1") + instance_ids = [ + "django__django-16569", + "django__django-11099", + "scikit-learn__scikit-learn-12585", + "django__django-13658", + "django__django-9296", + "astropy__astropy-14309", + "django__django-12155", + "django__django-16527", + "sympy__sympy-24213", + "django__django-11066", + ] + # ds = load_weave_dataset("SWE-bench_Verified", "test", instance_ids=instance_ids) + ds = load_weave_dataset("SWE-bench_Verified", "test", limit=50, shuffle_seed=42) + eval = weave.Evaluation( + name="SWE-bench_Verified", dataset=ds, scorers=[score_swebench], trials=5 + ) + + model = SWEBenchProgrammerModel( + agent=Agent( + model_name="gpt-4o-2024-08-06", + temperature=0.7, + system_message=SYSTEM_MESSAGE, + tools=[ + list_files, + run_command, + view_image, + read_lines_from_file, + replace_lines_in_file, + ], + ), + max_runtime_seconds=180, + ) + res = asyncio.run(eval.evaluate(model)) + print("RES", res) + + +if __name__ == "__main__": + main() diff --git a/programmer/swebench/ingest/README.md b/programmer/swebench/ingest/README.md new file mode 100644 index 0000000..31b6eee --- /dev/null +++ b/programmer/swebench/ingest/README.md @@ -0,0 +1,3 @@ +# weave swe-bench eval ingestion + +these scripts slurp existing https://github.com/swe-bench/experiments results into weave evals diff --git a/programmer/swebench/ingest/ingest_eval.py b/programmer/swebench/ingest/ingest_eval.py new file mode 100644 index 0000000..d000e0b --- /dev/null +++ b/programmer/swebench/ingest/ingest_eval.py @@ -0,0 +1,182 @@ +import argparse +import os +import sys +import asyncio +import json +import contextvars +from rich import print + +import weave +from make_dataset import load_weave_dataset + + +context_var = contextvars.ContextVar("context", default={}) + + +def load_instance_eval_file( + experiments_repo_path, dataset_name, model_name, instance_id, file_name +): + dataset_name_short = dataset_name.split("_")[1].lower() + file_path = os.path.join( + experiments_repo_path, + "evaluation", + dataset_name_short, + model_name, + "logs", + instance_id, + file_name, + ) + print(f"Loading file: {file_path}") + + if os.path.exists(file_path): + with open(file_path, "r") as file: + return file.read() + else: + return None + + +def load_instance_eval_from_logs( + experiments_repo_path, dataset_name, model_name, instance_id +): + report_json_file = load_instance_eval_file( + experiments_repo_path, + dataset_name, + model_name, + instance_id, + "report.json", + ) + report_json = None + if report_json_file is not None: + report_json = json.loads(report_json_file).get(instance_id) + no_report = False + if report_json is None: + no_report = True + + return { + "patch": load_instance_eval_file( + experiments_repo_path, dataset_name, model_name, instance_id, "patch.diff" + ), + "report": report_json, + "no_report": no_report, + } + + +def load_instance_eval_from_results( + experiments_repo_path, dataset_name, model_name, instance_id +): + dataset_name_short = dataset_name.split("_")[1].lower() + file_path = os.path.join( + experiments_repo_path, + "evaluation", + dataset_name_short, + model_name, + "results", + "results.json", + ) + with open(file_path, "r") as file: + results = json.loads(file.read()) + summary = {} + for k, instance_ids in results.items(): + summary[k] = instance_id in 
instance_ids + + return summary + + +class SWEBenchOfflineModel(weave.Model): + @weave.op + def predict(self, instance_id: str): + context = context_var.get() + experiments_repo_path = context.get("experiments_repo_path") + dataset_name = context.get("dataset_name") + return load_instance_eval_from_results( + experiments_repo_path, dataset_name, self.name, instance_id + ) + + +@weave.op +def score_from_logs(model_output: dict): + result = {} + if model_output.get("report"): + result.update(model_output["report"]) + result["no_report"] = model_output["no_report"] + return result + + +@weave.op +def score(model_output: dict): + return model_output + + +def ingest_eval(experiments_repo_path, dataset_name, model_name): + print(f"Ingesting evaluation logs for:") + print(f"Dataset: {dataset_name}") + print(f"Model: {model_name}") + print(f"From repository: {experiments_repo_path}") + + dataset = load_weave_dataset(dataset_name, "test") + eval = weave.Evaluation(name=dataset_name, dataset=dataset, scorers=[score]) + + context_var.set( + { + "experiments_repo_path": experiments_repo_path, + "dataset_name": dataset_name, + } + ) + + model = SWEBenchOfflineModel(name=model_name) + # result, call = asyncio.run(eval.evaluate.call(eval, model)) + result = asyncio.run(eval.evaluate(model)) + + print(result) + # call.set_display_name(model_name) + + +def ingest_evals(experiments_repo_path, dataset_name): + dataset_name_short = dataset_name.split("_")[1].lower() + models_dir = os.path.join(experiments_repo_path, "evaluation", dataset_name_short) + for model_name in os.listdir(models_dir): + ingest_eval(experiments_repo_path, dataset_name, model_name) + + +def main(): + parser = argparse.ArgumentParser(description="Ingest evaluation logs into Weave.") + parser.add_argument( + "--experiments_repo_path", help="Path to the experiments repository" + ) + parser.add_argument( + "--dataset_name", + choices=["SWE-bench", "SWE-bench_Verified", "SWE-bench_Lite"], + default="SWE-bench_Verified", + help="Name of the dataset", + ) + parser.add_argument("--model_name", help="Name of the model") + + args = parser.parse_args() + + if not args.experiments_repo_path or not os.path.exists(args.experiments_repo_path): + print( + f"Error: Experiments repository path does not exist: {args.experiments_repo_path}" + ) + sys.exit(1) + + # Initialize Weave + weave.init("weavedev-swebench5") + + if args.model_name: + ingest_eval(args.experiments_repo_path, args.dataset_name, args.model_name) + else: + ingest_evals(args.experiments_repo_path, args.dataset_name) + + +if __name__ == "__main__": + main() + # from rich import print + + # print( + # load_instance_eval( + # "/Users/shawnlewis/code/experiments", + # "SWE-bench_Verified", + # "20240620_sweagent_claude3.5sonnet", + # "sympy__sympy-24661", + # ) + # ) diff --git a/programmer/swebench/ingest/make_dataset.py b/programmer/swebench/ingest/make_dataset.py new file mode 100644 index 0000000..a92fe03 --- /dev/null +++ b/programmer/swebench/ingest/make_dataset.py @@ -0,0 +1,76 @@ +import argparse +import sys +from typing import Optional +import pandas as pd +import weave + + +splits = { + "dev": "data/dev-00000-of-00001.parquet", + "test": "data/test-00000-of-00001.parquet", + "train": "data/train-00000-of-00001.parquet", +} + + +def load_raw_dataset(name: str, split: str): + return pd.read_parquet( + f"hf://datasets/princeton-nlp/{name}/data/{split}-00000-of-00001.parquet" + ) + + +def load_weave_dataset(name: str, split: str, limit: Optional[int] = None): + df = 
load_raw_dataset(name, split)
+
+    data_list = df.to_dict("records")
+    data_list = data_list[:limit] if limit else data_list
+
+    return weave.Dataset(name=f"Verified-{limit}", rows=data_list)  # type: ignore
+
+
+def main(dataset_name="SWE-bench_Verified", split="test"):
+    valid_datasets = ["SWE-bench", "SWE-bench_Verified", "SWE-bench_Lite"]
+    valid_splits = ["dev", "test", "train"]
+
+    if dataset_name not in valid_datasets:
+        print(f"Error: Invalid dataset name. Choose from {', '.join(valid_datasets)}")
+        sys.exit(1)
+
+    if split not in valid_splits:
+        print(f"Error: Invalid split. Choose from {', '.join(valid_splits)}")
+        sys.exit(1)
+
+    print(f"Creating dataset: {dataset_name}")
+    print(f"Split: {split}")
+
+    weave.init("weavedev-swebench1")
+
+    df = load_raw_dataset(dataset_name, split)
+
+    data_list = df.to_dict("records")
+
+    dataset = weave.Dataset(rows=data_list)  # type: ignore
+
+    weave.publish(dataset, f"{dataset_name}_{split}")
+
+    print(f"Dataset '{dataset_name}_{split}' created and saved successfully.")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Create a dataset with specified name and split."
+    )
+    parser.add_argument(
+        "--dataset_name",
+        choices=["SWE-bench", "SWE-bench_Verified", "SWE-bench_Lite"],
+        default="SWE-bench_Verified",
+        help="Name of the dataset to create",
+    )
+    parser.add_argument(
+        "--split",
+        choices=["dev", "test", "train"],
+        default="test",
+        help="Split of the dataset to create",
+    )
+
+    args = parser.parse_args()
+    main(args.dataset_name, args.split)
diff --git a/programmer/swebench/ingest/requirements.txt b/programmer/swebench/ingest/requirements.txt
new file mode 100644
index 0000000..ac11f68
--- /dev/null
+++ b/programmer/swebench/ingest/requirements.txt
@@ -0,0 +1,4 @@
+weave
+pandas
+fsspec
+huggingface_hub
\ No newline at end of file
diff --git a/programmer/swebench/run_instance.py b/programmer/swebench/run_instance.py
new file mode 100644
index 0000000..f8846d5
--- /dev/null
+++ b/programmer/swebench/run_instance.py
@@ -0,0 +1,52 @@
+import os
+import argparse
+import pandas as pd
+
+from rich import print
+
+import weave
+
+from ..weave_next.api import init_local_client
+from ..settings_manager import SettingsManager
+
+from ..swebench.swebench_model import SWEBenchProgrammerModel
+from ..swebench.score import score_swebench
+from ..config import agent_replace
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Programmer")
+    parser.add_argument(
+        "--instance_id", type=str, help="The instance id to run", required=True
+    )
+
+    # Initialize settings
+    SettingsManager.initialize_settings()
+    logging_mode = SettingsManager.get_setting("weave_logging")
+    if logging_mode == "cloud":
+        curdir = os.path.basename(os.path.abspath(os.curdir))
+        weave.init(f"programmer-{curdir}")
+    elif logging_mode == "local":
+        init_local_client(os.path.join(SettingsManager.PROGRAMMER_DIR, "weave.db"))
+
+    args = parser.parse_args()
+
+    df = pd.read_parquet("programmer/swebench/data/swebench-verified.parquet")
+
+    instance_id = args.instance_id
+    instance = df[df["instance_id"] == instance_id].iloc[0]
+    problem_statement = instance["problem_statement"]
+
+    print("PROBLEM STATEMENT\n", problem_statement)
+    print()
+    print("SOLUTION\n", instance["patch"])
+    print()
+
+    model = SWEBenchProgrammerModel(agent=agent_replace)
+    model_output = model.predict(instance)
+    score = score_swebench(instance, model_output)
+    print("SCORE\n", score)
+
+
+if __name__ == "__main__":
+    main()
diff --git 
a/programmer/swebench/score.py b/programmer/swebench/score.py new file mode 100644 index 0000000..e9cd799 --- /dev/null +++ b/programmer/swebench/score.py @@ -0,0 +1,58 @@ +from typing import Any +from swebench.harness.test_spec import make_test_spec +from swebench.harness.log_parsers import MAP_REPO_TO_PARSER +from swebench.harness.grading import get_eval_tests_report, get_resolution_status +from swebench.harness.constants import ( + FAIL_TO_PASS, + KEY_INSTANCE_ID, + PASS_TO_PASS, + ResolvedStatus, + SWEbenchInstance, +) + +from ..tools import RemoteContainerToolContext + + +def score_swebench(instance: SWEbenchInstance, model_output): + patch = model_output["answer"] + tc = RemoteContainerToolContext( + "http://localhost:8000", + "/testbed", + "source /opt/miniconda3/bin/activate && conda activate testbed && ", + ) + + result: dict[str, Any] = {"patch_successfully_applied": False, "resolved": False} + + ts = make_test_spec(instance) + container_id = f"sweb.eval.x86_64.{ts.instance_id}" + with tc.context(container_id): + print("EVAL SCRIPT\n", ts.eval_script) + + tc.write_file("/tmp/patch.diff", patch) + patch_result = tc.run_command("git apply -v /tmp/patch.diff") + if patch_result["exit_code"] == 0: + result["patch_successfully_applied"] = True + print("PATCH RESULT\n", patch_result) + + tc.write_file("/eval.sh", ts.eval_script) + test_command_results = tc.run_command("chmod +x /eval.sh && /eval.sh") + tc_output = test_command_results["output"] + + repo = "-".join( + ts.instance_id.replace("__", "/").split("-")[:-1] + ) # e.g. scikit-learn/scikit-learn + log_parser = MAP_REPO_TO_PARSER[repo] + test_name_to_passfail = log_parser(tc_output) + + eval_ref = { + KEY_INSTANCE_ID: ts.instance_id, + FAIL_TO_PASS: ts.FAIL_TO_PASS, + PASS_TO_PASS: ts.PASS_TO_PASS, + } + + report = get_eval_tests_report(test_name_to_passfail, eval_ref) + resolved = get_resolution_status(report) == ResolvedStatus.FULL.value + + result.update({"resolved": resolved, "tests_status": report}) + + return result diff --git a/programmer/swebench/scripts/example_v_models.py b/programmer/swebench/scripts/example_v_models.py new file mode 100644 index 0000000..199b07d --- /dev/null +++ b/programmer/swebench/scripts/example_v_models.py @@ -0,0 +1,56 @@ +# using existing swe-bench results logged to weave (see ingest dir), +# produce a table with instance_id as rows, and models as columns. 
+# useful for finding easy / hard examples
+
+import sys
+import pandas as pd
+
+import weave
+
+from ...weave_next.weave_query import calls
+
+
+def main():
+    if len(sys.argv) > 1:
+        wc = weave.init("weavedev-swebench5")
+        c = calls(wc, "Evaluation.predict_and_score", expand_refs=["inputs.example"])
+        df = c.to_pandas()
+
+        df.to_parquet("verified.parquet", engine="pyarrow")
+    else:
+        df = pd.read_parquet("verified.parquet")
+    # Pivot the dataframe
+    pivot_df = df.pivot(
+        index="inputs.example.instance_id",
+        columns="inputs.model",
+        values="output.model_output.resolved",
+    )
+
+    # Extract model names from the column names
+    pivot_df.columns = pivot_df.columns.str.extract(r"object/(.+):")[0]
+
+    # Count models with resolved True for each instance
+    pivot_df["models_resolved_true"] = pivot_df.apply(lambda row: row.sum(), axis=1)
+
+    # Move the count column to the leftmost position
+    cols = pivot_df.columns.tolist()
+    cols = cols[-1:] + cols[:-1]
+    pivot_df = pivot_df[cols]
+
+    # Sort the pivot table by 'models_resolved_true' in descending order
+    pivot_df = pivot_df.sort_values(by="models_resolved_true", ascending=False)  # type: ignore
+
+    # Sort columns by the model that got the most resolved
+    model_success_count = pivot_df.sum().sort_values(ascending=False)
+    sorted_columns = ["models_resolved_true"] + model_success_count.index.tolist()
+    pivot_df = pivot_df[sorted_columns]
+
+    # Display the first few rows of the resulting table
+    print(pivot_df.head())
+
+    # Optionally, save the pivot table to a new file
+    pivot_df.to_csv("pivot_table.csv")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/programmer/swe-bench/swebench-difficulties.py b/programmer/swebench/scripts/verified_difficulty_labels.py
similarity index 97%
rename from programmer/swe-bench/swebench-difficulties.py
rename to programmer/swebench/scripts/verified_difficulty_labels.py
index 6a86e57..9c3068b 100644
--- a/programmer/swe-bench/swebench-difficulties.py
+++ b/programmer/swebench/scripts/verified_difficulty_labels.py
@@ -1,5 +1,6 @@
 # Quick script for viewing swebench examples against
 # annotated difficulties.
+# TODO: update for new file paths (in ../data)
 import pandas as pd
 import textwrap
diff --git a/programmer/swebench/swebench_model.py b/programmer/swebench/swebench_model.py
new file mode 100644
index 0000000..53805d6
--- /dev/null
+++ b/programmer/swebench/swebench_model.py
@@ -0,0 +1,41 @@
+import weave
+
+from ..agent import Agent, AgentState, TimeLimitExceeded
+from ..tools import RemoteContainerToolContext
+
+
+class SWEBenchProgrammerModel(weave.Model):
+    agent: Agent
+    max_runtime_seconds: int = 60
+
+    def predict(self, instance):
+        instance_id = instance["instance_id"]
+        problem_statement = instance["problem_statement"]
+        initial_prompt = f"""You are in a checkout of a git repo. Please identify and fix the issue described in the problem statement.
+ + +{problem_statement} +""" + state = AgentState( + history=[ + { + "role": "user", + "content": initial_prompt, + }, + ], + ) + + tc = RemoteContainerToolContext( + "http://localhost:8000", + "/testbed", + "source /opt/miniconda3/bin/activate && conda activate testbed && ", + ) + container_id = f"sweb.eval.x86_64.{instance_id}" + with tc.context(container_id): + try: + self.agent.run(state, max_runtime_seconds=self.max_runtime_seconds) + except TimeLimitExceeded: + return {"errorcode": "runtime", "answer": ""} + answer_result = tc.run_command("git diff") + answer = answer_result["output"] + return {"answer": answer} diff --git a/programmer/tests/test_file_line_tools.py b/programmer/tests/test_file_line_tools.py index 16cec22..eec5f8e 100644 --- a/programmer/tests/test_file_line_tools.py +++ b/programmer/tests/test_file_line_tools.py @@ -1,22 +1,20 @@ import os import pytest from tempfile import TemporaryDirectory -from programmer.tools import read_lines_from_file, replace_lines_in_file +from programmer.tools import read_lines_from_file, replace_lines_in_file, LocalToolContext, tool_context, get_current_context @pytest.fixture() -def temp_dir(): +def tempdir_tool_context(): with TemporaryDirectory() as tmpdir: - yield tmpdir + with tool_context(LocalToolContext(tmpdir)) as tc: + yield tc @pytest.fixture() -def test_file_path(temp_dir): - file_path = os.path.join(temp_dir, "test_file.txt") - with open(file_path, "w") as f: - f.write( - "Line 1\nLine 2\nLine 3\nLine 4\nLine 5\nLine 6\nLine 7\nLine 8\nLine 9\nLine 10\n" - ) +def test_file_path(tempdir_tool_context): + file_path = "test_file.txt" + tempdir_tool_context.write_file(file_path, "Line 1\nLine 2\nLine 3\nLine 4\nLine 5\nLine 6\nLine 7\nLine 8\nLine 9\nLine 10\n") yield file_path @@ -39,7 +37,7 @@ def test_read_lines_from_file(test_file_path): read_lines_from_file(test_file_path, 11) -def test_replace_lines_in_file(temp_dir, test_file_path): +def test_replace_lines_in_file(test_file_path): # Valid replacement result = replace_lines_in_file( test_file_path, @@ -54,8 +52,7 @@ def test_replace_lines_in_file(temp_dir, test_file_path): assert "10:Line 10\n" in result # Replacement with a new file - new_file_path = os.path.join(temp_dir, "new_test_file.txt") - result = replace_lines_in_file(new_file_path, 1, 0, "", "First Line\nSecond Line\n") + result = replace_lines_in_file("new_test_file.txt", 1, 0, "", "First Line\nSecond Line\n") assert "1:First Line\n" in result assert "2:Second Line\n" in result @@ -63,18 +60,16 @@ def test_replace_lines_in_file(temp_dir, test_file_path): # Test appending to the end of a file -def test_append_to_file(temp_dir, test_file_path): +def test_append_to_file(tempdir_tool_context, test_file_path): # Read the original content - with open(test_file_path, "r") as f: - original_content = f.read() + original_content = tempdir_tool_context.read_file(test_file_path) # Append new lines new_lines = "New Line 11\nNew Line 12\n" result = replace_lines_in_file(test_file_path, 11, 0, "", new_lines) # Verify the file content - with open(test_file_path, "r") as f: - updated_content = f.read() + updated_content = tempdir_tool_context.read_file(test_file_path) assert updated_content == original_content + new_lines @@ -90,10 +85,9 @@ def test_append_to_file(temp_dir, test_file_path): # Test inserting at the beginning of an existing file -def test_insert_at_beginning(test_file_path): +def test_insert_at_beginning(tempdir_tool_context, test_file_path): # Read the original content - with open(test_file_path, "r") as f: - 
original_content = f.read() + original_content = tempdir_tool_context.read_file(test_file_path) # Insert new lines at the beginning new_lines = "New First Line\nNew Second Line\n" @@ -105,8 +99,7 @@ def test_insert_at_beginning(test_file_path): assert "3:Line 1\n" in result # Verify the file content - with open(test_file_path, "r") as f: - updated_content = f.read() + updated_content = tempdir_tool_context.read_file(test_file_path) assert updated_content == new_lines + original_content diff --git a/programmer/tools.py b/programmer/tools.py index 8c685c6..c5e7766 100644 --- a/programmer/tools.py +++ b/programmer/tools.py @@ -4,28 +4,160 @@ import subprocess import weave import contextlib +import shlex from contextvars import ContextVar +from contextlib import contextmanager +from typing import Protocol, Union, TypedDict, Optional +import requests LENGTH_LIMIT = 30000 +# TODO: +# - get rid of resolve_path +# - must return FileNotFoundError in read_file in Remote -class ToolContext: + +class RunCommandResult(TypedDict): + exit_code: int + output: str + + +class ToolContext(Protocol): + def write_file(self, path: str, content: str) -> None: ... + + def read_file(self, path: str) -> str: ... + + def run_command(self, command: str) -> RunCommandResult: ... + + def resolve_path(self, path: str) -> str: ... + + +class LocalToolContext(ToolContext): def __init__(self, directory): self.directory = os.path.abspath(directory) - def resolve_path(self, path): + def write_file(self, path: str, content: str) -> None: + full_path = self.resolve_path(path) + with open(full_path, "w") as f: + f.write(content) + + def read_file(self, path: str) -> str: + full_path = self.resolve_path(path) + with open(full_path, "r") as f: + return f.read() + + def run_command(self, command: str) -> RunCommandResult: + completed_process = subprocess.run( + command, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + shell=True, + cwd=self.directory, + ) + exit_code = completed_process.returncode + output = completed_process.stdout.strip() + + return { + "exit_code": exit_code, + "output": output, + } + + def resolve_path(self, path: str) -> str: return os.path.join(self.directory, path) +class RemoteContainerToolContext(ToolContext): + def __init__(self, base_url: str, directory: str, command_prefix: str): + self.base_url = base_url + self.container_id = None + self.directory = directory + self.command_prefix = command_prefix + + @contextmanager + def context(self, image_id: str): + self.start_container(image_id) + try: + with tool_context(self): + yield + finally: + self.stop_container() + + def start_container(self, image_id): + response = requests.post( + f"{self.base_url}/container/start", json={"image_id": image_id} + ) + if response.status_code == 200: + self.container_id = response.json().get("container_id") + else: + print(f"Failed to start container: {response.text}") + + def stop_container(self): + response = requests.post( + f"{self.base_url}/container/stop", + json={"container_id": self.container_id, "delete": True}, + ) + if response.status_code == 200: + self.container_id = None + else: + print(f"Failed to stop container: {response.text}") + + def write_file(self, path: str, content: str) -> None: + full_path = os.path.join(self.directory, path) + response = requests.post( + f"{self.base_url}/container/write_file", + json={ + "container_id": self.container_id, + "file_path": full_path, + "file_content": content, + }, + ) + if response.status_code != 200: + raise Exception(f"Failed to write file: 
{response.text}") + + def read_file(self, path: str) -> str: + full_path = os.path.join(self.directory, path) + response = requests.post( + f"{self.base_url}/container/read_file", + json={"container_id": self.container_id, "file_path": full_path}, + ) + if response.status_code == 200: + return response.json().get("file_content") + else: + raise Exception(f"Failed to read file: {response.text}") + + def run_command(self, command: str) -> RunCommandResult: + command = self.command_prefix + command + command = f"bash -c {shlex.quote(command)}" + response = requests.post( + f"{self.base_url}/container/run", + json={ + "container_id": self.container_id, + "workdir": self.directory, + "command": command, + }, + ) + if response.status_code == 200: + json = response.json() + return { + "exit_code": json["exit_code"], + "output": json["output"], + } + else: + raise Exception(f"Failed to run command: {response.text}") + + def resolve_path(self, path: str) -> str: + return path # For remote containers, we assume paths are already resolved + + # Create a ContextVar to store the current ToolContext -current_context: ContextVar[ToolContext | None] = ContextVar( - "current_context", default=None -) +current_context: ContextVar[ + Optional[Union[LocalToolContext, RemoteContainerToolContext]] +] = ContextVar("current_context", default=None) @contextlib.contextmanager -def tool_context(directory): - context = ToolContext(directory) +def tool_context(context: Union[LocalToolContext, RemoteContainerToolContext]): token = current_context.set(context) try: yield context @@ -33,10 +165,10 @@ def tool_context(directory): current_context.reset(token) -def get_current_context(): +def get_current_context() -> Union[LocalToolContext, RemoteContainerToolContext]: context = current_context.get() if context is None: - return ToolContext(".") + return LocalToolContext(".") return context @@ -95,12 +227,16 @@ def list_files(directory: str) -> str: The list of files in the directory. """ context = get_current_context() - full_path = context.resolve_path(directory) - result = json.dumps(os.listdir(full_path)) - if len(result) > LENGTH_LIMIT: - result = result[:LENGTH_LIMIT] - result += "\n... (truncated)" - return result + # full_path = context.resolve_path(directory) + result = context.run_command(f"ls {directory}") + exit_code = result["exit_code"] + output = result["output"] + if exit_code != 0: + raise Exception(f"Failed to list files: {output}") + if len(output) > LENGTH_LIMIT: + output = output[:LENGTH_LIMIT] + output += "\n... (truncated)" + return output @weave.op() @@ -115,13 +251,14 @@ def write_to_file(path: str, content: str) -> str: A message indicating whether the file was written successfully. """ context = get_current_context() - full_path = context.resolve_path(path) - with open(full_path, "w") as f: - f.write(content) + if len(content) > LENGTH_LIMIT: + content = content[:LENGTH_LIMIT] + content += "\n... (truncated)" + context.write_file(path, content) return "File written successfully." -@weave.op() +@weave.op def read_from_file(path: str) -> str: """Read text from a file at the given path. @@ -132,13 +269,11 @@ def read_from_file(path: str) -> str: The content of the file. """ context = get_current_context() - full_path = context.resolve_path(path) - with open(full_path, "r") as f: - result = f.read() - if len(result) > LENGTH_LIMIT: - result = result[:LENGTH_LIMIT] - result += "\n... 
(truncated)" - return result + result = context.read_file(path) + if len(result) > LENGTH_LIMIT: + result = result[:LENGTH_LIMIT] + result += "\n... (truncated)" + return result @weave.op() @@ -152,30 +287,18 @@ def run_command(command: str) -> str: The output of the command. """ context = get_current_context() - completed_process = subprocess.run( - command, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - shell=True, - cwd=context.directory, # Set the working directory for the command - ) - exit_code = completed_process.returncode - stdout = completed_process.stdout.strip() - stderr = completed_process.stderr.strip() - - if len(stdout) > LENGTH_LIMIT: - stdout = stdout[:LENGTH_LIMIT] - stdout += "\n... (truncated)" - if len(stderr) > LENGTH_LIMIT: - stderr = stderr[:LENGTH_LIMIT] - stderr += "\n... (truncated)" + result = context.run_command(command) + + exit_code = result["exit_code"] + output = result["output"] + + if len(output) > LENGTH_LIMIT: + output = output[:LENGTH_LIMIT] + output += "\n... (truncated)" result = f"Exit code: {exit_code}\n" - if stderr: - result += f"STDERR\n{stderr}\n" - if stdout: - result += f"STDOUT\n{stdout}\n" + if output: + result += f"OUTPUT\n{output}\n" return result @@ -195,11 +318,8 @@ def read_lines_from_file(file_path: str, start_line: int) -> str: """ context = get_current_context() full_path = context.resolve_path(file_path) - if not os.path.exists(full_path): - raise Exception(f"File '{full_path}' does not exist.") - - with open(full_path, "r") as file: - lines = file.readlines() + content = context.read_file(full_path) + lines = content.splitlines() if start_line < 1 or start_line > len(lines): raise Exception("Invalid start_line number.") @@ -208,7 +328,7 @@ def read_lines_from_file(file_path: str, start_line: int) -> str: result = "" for i in range(start_line - 1, end_line - 1): - result += f"{i + 1}:{lines[i]}" + result += f"{i + 1}:{lines[i]}\n" return result @@ -238,44 +358,39 @@ def replace_lines_in_file( """ context = get_current_context() full_path = context.resolve_path(file_path) - lines = [] - if os.path.exists(full_path): - with open(full_path, "r") as file: - lines = file.readlines() + try: + content = context.read_file(full_path) + except FileNotFoundError: + content = "" + lines = content.splitlines() end_line = start_line + remove_line_count if start_line < 1 or end_line < start_line or start_line > len(lines) + 1: raise Exception("Invalid line range.") - prev_line_split = [l + "\n" for l in previous_lines.splitlines()] + prev_line_split = previous_lines.splitlines() if not lines[start_line - 1 : end_line - 1] == prev_line_split: raise Exception("Previous lines do not match.") # Adjust end_line if it exceeds the current number of lines end_line = min(end_line, len(lines) + 1) - if not new_lines.endswith("\n"): - new_lines += "\n" - # Convert new_lines string into a list of lines - new_lines_list = new_lines.splitlines(keepends=True) + new_lines_list = new_lines.splitlines() # Replace the specified line range lines[start_line - 1 : end_line - 1] = new_lines_list # Write the modified lines back to the file - with open(full_path, "w") as file: - file.writelines(lines) + context.write_file(full_path, "\n".join(lines) + "\n") # Determine the range for the output with a 5-line buffer output_start = max(start_line - 6, 0) - output_end = min( - start_line - 1 + len(new_lines_list) + 6, len(lines) - ) # Calculate buffer correctly + output_end = min(start_line - 1 + len(new_lines_list) + 6, len(lines)) result = "" for i in 
range(output_start, output_end): - result += f"{i + 1}:{lines[i]}" + result += f"{i + 1}:{lines[i]}\n" return result diff --git a/pyproject.toml b/pyproject.toml index ddb29a0..1fe20b5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,7 @@ license = { text = "Apache-2.0" } readme = "README.md" requires-python = ">=3.10" dependencies = [ - "weave", "streamlit", "pandas", "litellm" + "weave==0.50.15", "streamlit", "pandas", "litellm" ] [tool.setuptools] diff --git a/requirements-dev.txt b/requirements-dev.txt index 55b033e..ab309e2 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1 +1,7 @@ -pytest \ No newline at end of file +setuptools +wheel +pytest +pyright +fastapi +docker +swebench \ No newline at end of file
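
For reference, here is a minimal sketch of how the reworked tool-context API in `programmer/tools.py` is used after this change; the directory, image id, and commands are illustrative:

```
from programmer.tools import (
    LocalToolContext,
    RemoteContainerToolContext,
    run_command,
    tool_context,
)

# Local execution: tools resolve paths and run commands in the given directory.
with tool_context(LocalToolContext(".")):
    print(run_command("echo hello"))

# Remote execution: the same tool functions are routed to a container manager
# server. context() starts a container from the image, installs the tool
# context, and stops/deletes the container on exit.
tc = RemoteContainerToolContext(
    "http://localhost:8000",
    "/testbed",
    "source /opt/miniconda3/bin/activate && conda activate testbed && ",
)
with tc.context("sweb.eval.x86_64.sympy__sympy-20590"):
    print(run_command("git status"))
```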