diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 013d21d..ca60206 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -26,7 +26,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
- pip install setuptools wheel pytest pyright
+ pip install -r requirements-dev.txt
pip install -e .
- name: Run tests
diff --git a/programmer/agent.py b/programmer/agent.py
index 603fa81..bdb40fc 100644
--- a/programmer/agent.py
+++ b/programmer/agent.py
@@ -1,6 +1,7 @@
from typing import Any, Optional, Union
from pydantic import Field
import litellm
+import time
from openai.types.chat import (
ChatCompletionMessageParam,
)
@@ -14,6 +15,10 @@
from .environment import get_current_environment, EnvironmentSnapshotKey
+class TimeLimitExceeded(Exception):
+ pass
+
+
def get_commit_message(history: list[Any]) -> str:
# Commit message is the most recent message with 'content'
for i in range(len(history) - 1, -1, -1):
@@ -66,9 +71,10 @@ def step(self, state: AgentState) -> AgentState:
The new state of the environment.
"""
Console.step_start("agent", "green")
- ref = weave.obj_ref(state)
- if ref:
- print("state ref:", ref.uri())
+ # Printing this is ugly
+ # ref = weave.obj_ref(state)
+ # if ref:
+ # print("state ref:", ref.uri())
messages: list[ChatCompletionMessageParam] = [
{"role": "system", "content": self.system_message},
@@ -124,9 +130,17 @@ def step(self, state: AgentState) -> AgentState:
return AgentState(history=new_history, env_snapshot_key=snapshot_key)
@weave.op()
- def run(self, state: AgentState):
+ def run(self, state: AgentState, max_runtime_seconds: int = -1):
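+ # max_runtime_seconds <= 0 disables the time limit; the limit is checked
+ # after each step completes, so an in-progress step is never interrupted.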
+ start_time = time.time()
while True:
last_message = state.history[-1]
if last_message["role"] == "assistant" and "tool_calls" not in last_message:
return state
state = self.step(state)
+ if (
+ max_runtime_seconds > 0
+ and time.time() - start_time > max_runtime_seconds
+ ):
+ raise TimeLimitExceeded(
+ f"Agent runtime exceeded {max_runtime_seconds}s"
+ )
diff --git a/programmer/containerserver/README.md b/programmer/containerserver/README.md
new file mode 100644
index 0000000..3fc4307
--- /dev/null
+++ b/programmer/containerserver/README.md
@@ -0,0 +1,32 @@
+# Container Manager Server
+
+## Build images on server
+
+We use this to run SWE-bench from a local machine against containers on a remote server. See the [swebench README](../swebench/README.md) for steps to build the SWE-bench images.
+
+## Run and check server
+
+Copy cmserver.py to the remote machine:
+```
+gcloud compute scp --zone "us-west1-a" --project "weave-support-367421" cmserver.py programmer-benchmark2:~/
+```
+
+On the remote machine, start the server (just 1 worker for now; the manager keeps global state):
+```
+uvicorn cmserver:app --host 0.0.0.0 --port 8000 --workers 1
+```
+
+Tunnel from the local machine to the remote:
+```
+gcloud compute ssh --zone "us-west1-a" "programmer-benchmark" --project "weave-support-367421" -- -NL 8000:localhost:8000
+```
+
+On the local machine, run the check script:
+```
+python checkserver.py
+```
+
+When the check finishes, there should be no running containers left on the remote machine.
+
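+Quick manual smoke test of the API through the tunnel (a sketch; assumes the
+example image below has been built on the server):
+
+```
+curl -s -X POST localhost:8000/container/start \
+  -H 'Content-Type: application/json' \
+  -d '{"image_id": "sweb.eval.x86_64.sympy__sympy-20590"}'
+# returns {"container_id": "..."}; use it to run a command, then clean up:
+curl -s -X POST localhost:8000/container/run \
+  -H 'Content-Type: application/json' \
+  -d '{"container_id": "<container_id>", "workdir": "/", "command": "ls"}'
+curl -s -X POST localhost:8000/container/stop \
+  -H 'Content-Type: application/json' \
+  -d '{"container_id": "<container_id>", "delete": true}'
+```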
diff --git a/programmer/containerserver/checkserver.py b/programmer/containerserver/checkserver.py
new file mode 100644
index 0000000..e214180
--- /dev/null
+++ b/programmer/containerserver/checkserver.py
@@ -0,0 +1,129 @@
+import requests
+import threading
+import argparse
+
+# Replace with the actual host and port if different
+BASE_URL = "http://127.0.0.1:8000"
+
+
+def start_container(image_id: str):
+ response = requests.post(f"{BASE_URL}/container/start", json={"image_id": image_id})
+ if response.status_code == 200:
+ return response.json().get("container_id")
+ else:
+ print(f"Failed to start container: {response.text}")
+ return None
+
+
+def run_command(container_id: str, workdir: str, command: str):
+ response = requests.post(
+ f"{BASE_URL}/container/run",
+ json={"container_id": container_id, "workdir": workdir, "command": command},
+ )
+ if response.status_code == 200:
+ return response.json()
+ else:
+ print(f"Failed to run command: {response.text}")
+ return None
+
+
+def write_file(container_id: str, file_path: str, file_content: str):
+ response = requests.post(
+ f"{BASE_URL}/container/write_file",
+ json={
+ "container_id": container_id,
+ "file_path": file_path,
+ "file_content": file_content,
+ },
+ )
+ if response.status_code == 200:
+ return response.json().get("status")
+ else:
+ print(f"Failed to write file: {response.text}")
+ return None
+
+
+def read_file(container_id: str, file_path: str):
+ response = requests.post(
+ f"{BASE_URL}/container/read_file",
+ json={"container_id": container_id, "file_path": file_path},
+ )
+ if response.status_code == 200:
+ return response.json().get("file_content")
+ else:
+ print(f"Failed to read file: {response.text}")
+ return None
+
+
+def stop_container(container_id: str, delete: bool):
+ response = requests.post(
+ f"{BASE_URL}/container/stop",
+ json={"container_id": container_id, "delete": delete},
+ )
+ if response.status_code == 200:
+ return response.json().get("status")
+ else:
+ print(f"Failed to stop container: {response.text}")
+ return None
+
+
+def manage_container(image_id: str, container_index: int):
+ print(f"Starting container {container_index}...")
+ container_id = start_container(image_id)
+ if not container_id:
+ print(f"Failed to start container {container_index}")
+ return
+
+ print(f"Started container {container_index} with ID: {container_id}")
+
+ # Run a command inside the container
+ output = run_command(container_id, "/", "ls")
+ if output:
+ print(f"Container {container_index} command output:\n{output}")
+
+ # Write a file inside the container
+ file_path = f"test_{container_index}.txt"
+ file_content = f"Hello, this is a test for container {container_index}."
+ write_status = write_file(container_id, file_path, file_content)
+ if write_status:
+ print(f"Container {container_index} write file status: {write_status}")
+
+ # Read the file back from the container
+ read_content = read_file(container_id, file_path)
+ if read_content:
+ print(f"Container {container_index} file content:\n{read_content}")
+
+ # Stop the container (and delete it)
+ stop_status = stop_container(container_id, delete=True)
+ if stop_status:
+ print(f"Container {container_index} stop status: {stop_status}")
+
+
+def run_parallel_tests(image_id: str, parallelism: int):
+ threads = []
+ for i in range(parallelism):
+ thread = threading.Thread(target=manage_container, args=(image_id, i))
+ threads.append(thread)
+ thread.start()
+
+ for thread in threads:
+ thread.join()
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description="Run parallel container tests")
+ parser.add_argument(
+ "--parallelism",
+ type=int,
+ default=1,
+ help="Number of parallel container operations (default: 1)",
+ )
+ parser.add_argument(
+ "--image-id",
+ type=str,
+ default="sweb.eval.x86_64.sympy__sympy-20590",
+ help="Image ID to test",
+ )
+ args = parser.parse_args()
+
+ run_parallel_tests(args.image_id, args.parallelism)
diff --git a/programmer/containerserver/cmserver.py b/programmer/containerserver/cmserver.py
new file mode 100644
index 0000000..2d8dd3e
--- /dev/null
+++ b/programmer/containerserver/cmserver.py
@@ -0,0 +1,178 @@
+import os
+import tarfile
+from io import BytesIO
+from concurrent.futures import ThreadPoolExecutor
+import asyncio
+import docker
+from docker.errors import NotFound
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+
+
+# DockerContainerManager class
+class DockerContainerManager:
+ def __init__(self):
+ self.client = docker.from_env()
+ self.executor = ThreadPoolExecutor()
+
+ async def start_container(self, image_id: str):
+ loop = asyncio.get_event_loop()
+ container = await loop.run_in_executor(
+ self.executor, self._run_container, image_id
+ )
+ return container.short_id
+
+ def _run_container(self, image_id: str):
+ return self.client.containers.run(
+ image_id, detach=True, command="tail -f /dev/null"
+ )
+
+ def _get_container(self, container_id: str):
+ return self.client.containers.get(container_id)
+
+ async def run_command(self, container_id: str, workdir: str, command: str):
+ loop = asyncio.get_event_loop()
+ exec_result = await loop.run_in_executor(
+ self.executor, self._exec_run, container_id, command, workdir
+ )
+ return {
+ "exit_code": exec_result.exit_code,
+ "output": exec_result.output.decode("utf-8"),
+ }
+
+ def _exec_run(self, container_id: str, command: str, workdir: str):
+ container = self._get_container(container_id)
+ return container.exec_run(command, workdir=workdir)
+
+ async def write_file(self, container_id: str, file_path: str, file_content: str):
+ file_path = os.path.join("/", file_path)
+ container = self._get_container(container_id)
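+ # docker-py's put_archive only accepts tar archives, so wrap the content
+ # in an in-memory tarball.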
+ tarstream = BytesIO()
+ with tarfile.open(fileobj=tarstream, mode="w") as tar:
+ tarinfo = tarfile.TarInfo(name=os.path.basename(file_path))
+ tarinfo.size = len(file_content)
+ tar.addfile(tarinfo, BytesIO(file_content.encode("utf-8")))
+ tarstream.seek(0)
+
+ loop = asyncio.get_event_loop()
+ await loop.run_in_executor(
+ self.executor,
+ container.put_archive,
+ os.path.dirname(file_path),
+ tarstream,
+ )
+
+ async def read_file(self, container_id: str, file_path: str):
+ container = self._get_container(container_id)
+ loop = asyncio.get_event_loop()
+ bits, _ = await loop.run_in_executor(
+ self.executor, container.get_archive, file_path
+ )
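+ # get_archive returns the file wrapped in a tar stream; buffer it, then
+ # extract the single member.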
+ file_content = BytesIO()
+ for chunk in bits:
+ file_content.write(chunk)
+ file_content.seek(0)
+ with tarfile.open(fileobj=file_content) as tar:
+ member = tar.getmembers()[0]
+ extract_result = tar.extractfile(member)
+ if extract_result is None:
+ raise Exception(f"Unexpected tar.extractfile result for: {file_path}")
+ file_data = extract_result.read()
+ return file_data.decode("utf-8")
+
+ async def stop_container(self, container_id: str, delete: bool = False):
+ container = self._get_container(container_id)
+ loop = asyncio.get_event_loop()
+ await loop.run_in_executor(self.executor, container.stop)
+ if delete:
+ await loop.run_in_executor(self.executor, container.remove)
+
+
+# FastAPI setup
+app = FastAPI()
+container_manager = DockerContainerManager()
+
+
+class StartContainerRequest(BaseModel):
+ image_id: str
+
+
+class StopContainerRequest(BaseModel):
+ container_id: str
+ delete: bool
+
+
+class CommandRequest(BaseModel):
+ container_id: str
+ workdir: str
+ command: str
+
+
+class FileRequest(BaseModel):
+ container_id: str
+ file_path: str
+ file_content: str
+
+
+class FilePathRequest(BaseModel):
+ container_id: str
+ file_path: str
+
+
+@app.post("/container/start")
+async def start_container(request: StartContainerRequest):
+ try:
+ container_id = await container_manager.start_container(request.image_id)
+ return {"container_id": container_id}
+ except Exception as e:
+ raise HTTPException(status_code=500, detail=str(e))
+
+
+@app.post("/container/run")
+async def run_command(request: CommandRequest):
+ try:
+ result = await container_manager.run_command(
+ request.container_id, request.workdir, request.command
+ )
+ return {"exit_code": result["exit_code"], "output": result["output"]}
+ except Exception as e:
+ raise HTTPException(status_code=500, detail=str(e))
+
+
+@app.post("/container/write_file")
+async def write_file(request: FileRequest):
+ try:
+ await container_manager.write_file(
+ request.container_id, request.file_path, request.file_content
+ )
+ return {"status": "file written"}
+ except NotFound as e:
+ raise HTTPException(status_code=404, detail=str(e))
+ except Exception as e:
+ raise HTTPException(status_code=500, detail=str(e))
+
+
+@app.post("/container/read_file")
+async def read_file(request: FilePathRequest):
+ try:
+ file_content = await container_manager.read_file(
+ request.container_id, request.file_path
+ )
+ return {"file_content": file_content}
+ except NotFound as e:
+ raise HTTPException(status_code=404, detail=str(e))
+ except Exception as e:
+ raise HTTPException(status_code=500, detail=str(e))
+
+
+@app.post("/container/stop")
+async def stop_container(request: StopContainerRequest):
+ try:
+ await container_manager.stop_container(request.container_id, request.delete)
+ return {"status": "container stopped"}
+ except Exception as e:
+ raise HTTPException(status_code=500, detail=str(e))
+
+
+# To run the server (1 worker only, since the manager keeps global state):
+# uvicorn cmserver:app --host 0.0.0.0 --port 8000 --workers 1
diff --git a/programmer/evals/eval_repeated_edits.py b/programmer/evals/eval_repeated_edits.py
index 3443a4c..9d63819 100644
--- a/programmer/evals/eval_repeated_edits.py
+++ b/programmer/evals/eval_repeated_edits.py
@@ -10,14 +10,14 @@
from ..agent import AgentState, Agent
from ..config import agent, agent_claude, agent_claude_replace, agent_replace
-from ..tools import tool_context
+from ..tools import tool_context, LocalToolContext
# @pytest.fixture
@contextmanager
def tempdir():
with tempfile.TemporaryDirectory() as dir_:
- with tool_context(dir_) as tc:
+ with tool_context(LocalToolContext(dir_)) as tc:
yield tc
diff --git a/programmer/programmer.py b/programmer/programmer.py
index 739db86..78f9b1a 100644
--- a/programmer/programmer.py
+++ b/programmer/programmer.py
@@ -10,9 +10,9 @@
import weave
-from .agent import AgentState, get_commit_message
+from .agent import Agent, AgentState, get_commit_message
from .console import Console
-from .config import agent
+from .config import agent_replace
from .environment import (
environment_session,
restore_environment,
@@ -34,9 +34,10 @@ def get_user_input():
@weave.op
def user_input_step(state: AgentState) -> AgentState:
Console.step_start("user_input", "purple")
- ref = weave.obj_ref(state)
- if ref:
- print("state ref:", ref.uri())
+ # Printing this is ugly
+ # ref = weave.obj_ref(state)
+ # if ref:
+ # print("state ref:", ref.uri())
user_input = get_user_input()
environment = get_current_environment()
history = state.history + [
@@ -63,7 +64,7 @@ def make_environment():
@weave.op
-def session(agent_state: AgentState):
+def session(agent: Agent, agent_state: AgentState):
call = weave.get_current_call()
session_id = None
@@ -160,7 +161,7 @@ def main():
],
)
- session(state)
+ session(agent_replace, state)
if __name__ == "__main__":
diff --git a/programmer/swe-bench/code.patch b/programmer/swe-bench/code.patch
deleted file mode 100644
index 9d29e02..0000000
--- a/programmer/swe-bench/code.patch
+++ /dev/null
@@ -1,29 +0,0 @@
-diff --git a/sympy/core/numbers.py b/sympy/core/numbers.py
---- a/sympy/core/numbers.py
-+++ b/sympy/core/numbers.py
-@@ -1624,10 +1624,11 @@ def __new__(cls, p, q=None, gcd=None):
-
- q = 1
- gcd = 1
-+ Q = 1
-
- if not isinstance(p, SYMPY_INTS):
- p = Rational(p)
-- q *= p.q
-+ Q *= p.q
- p = p.p
- else:
- p = int(p)
-@@ -1635,9 +1636,10 @@ def __new__(cls, p, q=None, gcd=None):
- if not isinstance(q, SYMPY_INTS):
- q = Rational(q)
- p *= q.q
-- q = q.p
-+ Q *= q.p
- else:
-- q = int(q)
-+ Q *= int(q)
-+ q = Q
-
- # p and q are now ints
- if q == 0:
diff --git a/programmer/swe-bench/problem.txt b/programmer/swe-bench/problem.txt
deleted file mode 100644
index ecc84ee..0000000
--- a/programmer/swe-bench/problem.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-
-
-Rational calc value error
-python 3.11, sympy 1.11.1
-when calc Rational('0.5', '100'), the value is 1/100100; but Rational(0.5, 100) the value is 1/200, this value is the true value, and the version of sympy 1.8 is normal
-
-
diff --git a/programmer/swe-bench/test_code.patch b/programmer/swe-bench/test_code.patch
deleted file mode 100644
index 6fa0c19..0000000
--- a/programmer/swe-bench/test_code.patch
+++ /dev/null
@@ -1,17 +0,0 @@
-diff --git a/sympy/core/tests/test_numbers.py b/sympy/core/tests/test_numbers.py
---- a/sympy/core/tests/test_numbers.py
-+++ b/sympy/core/tests/test_numbers.py
-@@ -366,6 +366,13 @@ def test_Rational_new():
- assert n.q == 4
- assert n.p == -2
-
-+def test_issue_24543():
-+ for p in ('1.5', 1.5, 2):
-+ for q in ('1.5', 1.5, 2):
-+ assert Rational(p, q).as_numer_denom() == Rational('%s/%s'%(p,q)).as_numer_denom()
-+
-+ assert Rational('0.5', '100') == Rational(1, 200)
-+
-
- def test_Number_new():
- """"
diff --git a/programmer/swebench/README.md b/programmer/swebench/README.md
new file mode 100644
index 0000000..582e5a7
--- /dev/null
+++ b/programmer/swebench/README.md
@@ -0,0 +1,63 @@
+# SWE Bench programmer evaluation
+
+## Build SWE-bench images
+
+First complete the setup (below), then run this command to build all the images. Passing `--cache_level instance` tells the script not to delete the per-instance images, which are what we use with the container manager.
+
+```
+python -m swebench.harness.run_evaluation \
+ --predictions_path gold \
+ --max_workers 24 \
+ --run_id validate-gold \
+ --dataset_name princeton-nlp/SWE-bench_Verified \
+ --cache_level instance
+```
+
+
+## Remote machine setup instructions (GCP VM, Ubuntu 20.04)
+
+```
+
+sudo snap install docker
+sudo groupadd docker
+sudo usermod -aG docker $USER
+sudo chown root:docker /var/run/docker.sock
+sudo chmod 660 /var/run/docker.sock
+
+sudo apt update
+sudo apt install -y \
+ build-essential \
+ libbz2-dev \
+ libreadline-dev \
+ libssl-dev \
+ zlib1g-dev \
+ libsqlite3-dev \
+ libffi-dev \
+ libncursesw5-dev \
+ libgdbm-dev \
+ liblzma-dev \
+ tk-dev \
+ libdb-dev \
+ libexpat1-dev \
+ libmpdec-dev \
+ libxml2-dev \
+ libxmlsec1-dev
+
+# pyenv
+curl https://pyenv.run | bash
+echo 'export PYENV_ROOT="$HOME/.pyenv"
+[[ -d $PYENV_ROOT/bin ]] && export PATH="$PYENV_ROOT/bin:$PATH"
+eval "$(pyenv init -)"
+eval "$(pyenv virtualenv-init -)"' >> ~/.bashrc
+## exit and log back in so the shell picks up the pyenv init
+
+pyenv install 3.10.12
+pyenv virtualenv 3.10.12 swe-bench
+
+git clone https://github.com/princeton-nlp/SWE-bench.git
+cd SWE-bench
+pyenv local swe-bench
+pip install -e .
+```
\ No newline at end of file
diff --git a/programmer/swebench/__init__.py b/programmer/swebench/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/programmer/swe-bench/ensembled_annotations_public.csv b/programmer/swebench/data/ensembled_annotations_public.csv
similarity index 100%
rename from programmer/swe-bench/ensembled_annotations_public.csv
rename to programmer/swebench/data/ensembled_annotations_public.csv
diff --git a/programmer/swe-bench/samples_with_3_annotations_public.csv b/programmer/swebench/data/samples_with_3_annotations_public.csv
similarity index 100%
rename from programmer/swe-bench/samples_with_3_annotations_public.csv
rename to programmer/swebench/data/samples_with_3_annotations_public.csv
diff --git a/programmer/swe-bench/swebench-verified.parquet b/programmer/swebench/data/swebench-verified.parquet
similarity index 100%
rename from programmer/swe-bench/swebench-verified.parquet
rename to programmer/swebench/data/swebench-verified.parquet
diff --git a/programmer/swebench/evaluate.py b/programmer/swebench/evaluate.py
new file mode 100644
index 0000000..38a5725
--- /dev/null
+++ b/programmer/swebench/evaluate.py
@@ -0,0 +1,88 @@
+import asyncio
+import pandas as pd
+from typing import Optional
+import random
+import weave
+
+from .swebench_model import SWEBenchProgrammerModel
+from .score import score_swebench
+from ..agent import Agent
+from ..config import SYSTEM_MESSAGE
+from ..tools import (
+ list_files,
+ run_command,
+ view_image,
+ read_lines_from_file,
+ replace_lines_in_file,
+)
+
+
+def load_raw_dataset(name: str, split: str):
+ return pd.read_parquet(
+ f"hf://datasets/princeton-nlp/{name}/data/{split}-00000-of-00001.parquet"
+ )
+
+
+def load_weave_dataset(
+ name: str,
+ split: str,
+ limit: Optional[int] = None,
+ instance_ids: Optional[list[str]] = None,
+ shuffle_seed: Optional[int] = None,
+):
+ df = load_raw_dataset(name, split)
+
+ data_list = df.to_dict("records")
+ if shuffle_seed is not None:
+ random.seed(shuffle_seed)
+ random.shuffle(data_list)
+ data_list = [
+ r for r in data_list if instance_ids is None or r["instance_id"] in instance_ids
+ ]
+ data_list = data_list[:limit] if limit else data_list
+ data_list = [{"instance": r} for r in data_list]
+
+ return weave.Dataset(name=f"Verified-{limit}-{shuffle_seed}", rows=data_list) # type: ignore
+
+
+def main():
+ weave.init("weavedev-programmereval1")
+ instance_ids = [
+ "django__django-16569",
+ "django__django-11099",
+ "scikit-learn__scikit-learn-12585",
+ "django__django-13658",
+ "django__django-9296",
+ "astropy__astropy-14309",
+ "django__django-12155",
+ "django__django-16527",
+ "sympy__sympy-24213",
+ "django__django-11066",
+ ]
+ # ds = load_weave_dataset("SWE-bench_Verified", "test", instance_ids=instance_ids)
+ ds = load_weave_dataset("SWE-bench_Verified", "test", limit=50, shuffle_seed=42)
+ eval = weave.Evaluation(
+ name="SWE-bench_Verified", dataset=ds, scorers=[score_swebench], trials=5
+ )
+
+ model = SWEBenchProgrammerModel(
+ agent=Agent(
+ model_name="gpt-4o-2024-08-06",
+ temperature=0.7,
+ system_message=SYSTEM_MESSAGE,
+ tools=[
+ list_files,
+ run_command,
+ view_image,
+ read_lines_from_file,
+ replace_lines_in_file,
+ ],
+ ),
+ max_runtime_seconds=180,
+ )
+ res = asyncio.run(eval.evaluate(model))
+ print("RES", res)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/programmer/swebench/ingest/README.md b/programmer/swebench/ingest/README.md
new file mode 100644
index 0000000..31b6eee
--- /dev/null
+++ b/programmer/swebench/ingest/README.md
@@ -0,0 +1,3 @@
+# weave swe-bench eval ingestion
+
+These scripts ingest existing https://github.com/swe-bench/experiments results into Weave evals.
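+
+For example, to ingest one model's results (a sketch; assumes a local checkout
+of the experiments repo, with the model name taken from the directory names
+under evaluation/verified/):
+
+```
+python ingest_eval.py \
+  --experiments_repo_path ~/code/experiments \
+  --dataset_name SWE-bench_Verified \
+  --model_name 20240620_sweagent_claude3.5sonnet
+```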
diff --git a/programmer/swebench/ingest/ingest_eval.py b/programmer/swebench/ingest/ingest_eval.py
new file mode 100644
index 0000000..d000e0b
--- /dev/null
+++ b/programmer/swebench/ingest/ingest_eval.py
@@ -0,0 +1,182 @@
+import argparse
+import os
+import sys
+import asyncio
+import json
+import contextvars
+from rich import print
+
+import weave
+from make_dataset import load_weave_dataset
+
+
+context_var = contextvars.ContextVar("context", default={})
+
+
+def load_instance_eval_file(
+ experiments_repo_path, dataset_name, model_name, instance_id, file_name
+):
+ dataset_name_short = dataset_name.split("_")[1].lower()
+ file_path = os.path.join(
+ experiments_repo_path,
+ "evaluation",
+ dataset_name_short,
+ model_name,
+ "logs",
+ instance_id,
+ file_name,
+ )
+ print(f"Loading file: {file_path}")
+
+ if os.path.exists(file_path):
+ with open(file_path, "r") as file:
+ return file.read()
+ else:
+ return None
+
+
+def load_instance_eval_from_logs(
+ experiments_repo_path, dataset_name, model_name, instance_id
+):
+ report_json_file = load_instance_eval_file(
+ experiments_repo_path,
+ dataset_name,
+ model_name,
+ instance_id,
+ "report.json",
+ )
+ report_json = None
+ if report_json_file is not None:
+ report_json = json.loads(report_json_file).get(instance_id)
+ no_report = False
+ if report_json is None:
+ no_report = True
+
+ return {
+ "patch": load_instance_eval_file(
+ experiments_repo_path, dataset_name, model_name, instance_id, "patch.diff"
+ ),
+ "report": report_json,
+ "no_report": no_report,
+ }
+
+
+def load_instance_eval_from_results(
+ experiments_repo_path, dataset_name, model_name, instance_id
+):
+ dataset_name_short = dataset_name.split("_")[1].lower()
+ file_path = os.path.join(
+ experiments_repo_path,
+ "evaluation",
+ dataset_name_short,
+ model_name,
+ "results",
+ "results.json",
+ )
+ with open(file_path, "r") as file:
+ results = json.loads(file.read())
+ summary = {}
+ for k, instance_ids in results.items():
+ summary[k] = instance_id in instance_ids
+
+ return summary
+
+
+class SWEBenchOfflineModel(weave.Model):
+ @weave.op
+ def predict(self, instance_id: str):
+ context = context_var.get()
+ experiments_repo_path = context.get("experiments_repo_path")
+ dataset_name = context.get("dataset_name")
+ return load_instance_eval_from_results(
+ experiments_repo_path, dataset_name, self.name, instance_id
+ )
+
+
+@weave.op
+def score_from_logs(model_output: dict):
+ result = {}
+ if model_output.get("report"):
+ result.update(model_output["report"])
+ result["no_report"] = model_output["no_report"]
+ return result
+
+
+@weave.op
+def score(model_output: dict):
+ return model_output
+
+
+def ingest_eval(experiments_repo_path, dataset_name, model_name):
+ print(f"Ingesting evaluation logs for:")
+ print(f"Dataset: {dataset_name}")
+ print(f"Model: {model_name}")
+ print(f"From repository: {experiments_repo_path}")
+
+ dataset = load_weave_dataset(dataset_name, "test")
+ eval = weave.Evaluation(name=dataset_name, dataset=dataset, scorers=[score])
+
+ context_var.set(
+ {
+ "experiments_repo_path": experiments_repo_path,
+ "dataset_name": dataset_name,
+ }
+ )
+
+ model = SWEBenchOfflineModel(name=model_name)
+ # result, call = asyncio.run(eval.evaluate.call(eval, model))
+ result = asyncio.run(eval.evaluate(model))
+
+ print(result)
+ # call.set_display_name(model_name)
+
+
+def ingest_evals(experiments_repo_path, dataset_name):
+ dataset_name_short = dataset_name.split("_")[1].lower()
+ models_dir = os.path.join(experiments_repo_path, "evaluation", dataset_name_short)
+ for model_name in os.listdir(models_dir):
+ ingest_eval(experiments_repo_path, dataset_name, model_name)
+
+
+def main():
+ parser = argparse.ArgumentParser(description="Ingest evaluation logs into Weave.")
+ parser.add_argument(
+ "--experiments_repo_path", help="Path to the experiments repository"
+ )
+ parser.add_argument(
+ "--dataset_name",
+ choices=["SWE-bench", "SWE-bench_Verified", "SWE-bench_Lite"],
+ default="SWE-bench_Verified",
+ help="Name of the dataset",
+ )
+ parser.add_argument("--model_name", help="Name of the model")
+
+ args = parser.parse_args()
+
+ if not args.experiments_repo_path or not os.path.exists(args.experiments_repo_path):
+ print(
+ f"Error: Experiments repository path does not exist: {args.experiments_repo_path}"
+ )
+ sys.exit(1)
+
+ # Initialize Weave
+ weave.init("weavedev-swebench5")
+
+ if args.model_name:
+ ingest_eval(args.experiments_repo_path, args.dataset_name, args.model_name)
+ else:
+ ingest_evals(args.experiments_repo_path, args.dataset_name)
+
+
+if __name__ == "__main__":
+ main()
+ # from rich import print
+
+ # print(
+ # load_instance_eval(
+ # "/Users/shawnlewis/code/experiments",
+ # "SWE-bench_Verified",
+ # "20240620_sweagent_claude3.5sonnet",
+ # "sympy__sympy-24661",
+ # )
+ # )
diff --git a/programmer/swebench/ingest/make_dataset.py b/programmer/swebench/ingest/make_dataset.py
new file mode 100644
index 0000000..a92fe03
--- /dev/null
+++ b/programmer/swebench/ingest/make_dataset.py
@@ -0,0 +1,76 @@
+import argparse
+import sys
+from typing import Optional
+import pandas as pd
+import weave
+
+
+splits = {
+ "dev": "data/dev-00000-of-00001.parquet",
+ "test": "data/test-00000-of-00001.parquet",
+ "train": "data/train-00000-of-00001.parquet",
+}
+
+
+def load_raw_dataset(name: str, split: str):
+ return pd.read_parquet(
+ f"hf://datasets/princeton-nlp/{name}/data/{split}-00000-of-00001.parquet"
+ )
+
+
+def load_weave_dataset(name: str, split: str, limit: Optional[int] = None):
+ df = load_raw_dataset(name, split)
+
+ data_list = df.to_dict("records")
+ data_list = data_list[:limit] if limit else data_list
+
+ return weave.Dataset(name=f"Verified-{limit}", rows=data_list) # type: ignore
+
+
+def main(dataset_name="SWE-bench_Verified", split="test"):
+ valid_datasets = ["SWE-bench", "SWE-bench_Verified", "SWE-bench_Lite"]
+ valid_splits = ["dev", "test", "train"]
+
+ if dataset_name not in valid_datasets:
+ print(f"Error: Invalid dataset name. Choose from {', '.join(valid_datasets)}")
+ sys.exit(1)
+
+ if split not in valid_splits:
+ print(f"Error: Invalid split. Choose from {', '.join(valid_splits)}")
+ sys.exit(1)
+
+ print(f"Creating dataset: {dataset_name}")
+ print(f"Split: {split}")
+
+ weave.init("weavedev-swebench1")
+
+ df = load_raw_dataset(dataset_name, split)
+
+ data_list = df.to_dict("records")
+
+ dataset = weave.Dataset(rows=data_list) # type: ignore
+
+ weave.publish(dataset, f"{dataset_name}_{split}")
+
+ print(f"Dataset '{dataset_name}_{split}' created and saved successfully.")
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(
+ description="Create a dataset with specified name and split."
+ )
+ parser.add_argument(
+ "--dataset_name",
+ choices=["SWE-bench", "SWE-bench_Verified", "SWE-bench_Lite"],
+ default="SWE-bench_Verified",
+ help="Name of the dataset to create",
+ )
+ parser.add_argument(
+ "--split",
+ choices=["dev", "test", "train"],
+ default="test",
+ help="Split of the dataset to create",
+ )
+
+ args = parser.parse_args()
+ main(args.dataset_name, args.split)
diff --git a/programmer/swebench/ingest/requirements.txt b/programmer/swebench/ingest/requirements.txt
new file mode 100644
index 0000000..ac11f68
--- /dev/null
+++ b/programmer/swebench/ingest/requirements.txt
@@ -0,0 +1,4 @@
+weave
+pandas
+fsspec
+huggingface_hub
\ No newline at end of file
diff --git a/programmer/swebench/run_instance.py b/programmer/swebench/run_instance.py
new file mode 100644
index 0000000..f8846d5
--- /dev/null
+++ b/programmer/swebench/run_instance.py
@@ -0,0 +1,52 @@
+import os
+import argparse
+import pandas as pd
+
+from rich import print
+
+import weave
+
+from ..weave_next.api import init_local_client
+from ..settings_manager import SettingsManager
+
+from ..swebench.swebench_model import SWEBenchProgrammerModel
+from ..swebench.score import score_swebench
+from ..config import agent_replace
+
+
+def main():
+ parser = argparse.ArgumentParser(description="Programmer")
+ parser.add_argument(
+ "--instance_id", type=str, help="The instance id to run", required=True
+ )
+
+ # Initialize settings
+ SettingsManager.initialize_settings()
+ logging_mode = SettingsManager.get_setting("weave_logging")
+ if logging_mode == "cloud":
+ curdir = os.path.basename(os.path.abspath(os.curdir))
+ weave.init(f"programmer-{curdir}")
+ elif logging_mode == "local":
+ init_local_client(os.path.join(SettingsManager.PROGRAMMER_DIR, "weave.db"))
+
+ args = parser.parse_args()
+
+ df = pd.read_parquet("programmer/swebench/data/swebench-verified.parquet")
+
+ instance_id = args.instance_id
+ instance = df[df["instance_id"] == instance_id].iloc[0]
+ problem_statement = instance["problem_statement"]
+
+ print("PROBLEM STATEMENT\n", problem_statement)
+ print()
+ print("SOLUTION\n", instance["patch"])
+ print()
+
+ model = SWEBenchProgrammerModel(agent=agent_replace)
+ model_output = model.predict(instance)
+ score = score_swebench(instance, model_output["answer"])
+ print("SCORE\n", score)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/programmer/swebench/score.py b/programmer/swebench/score.py
new file mode 100644
index 0000000..e9cd799
--- /dev/null
+++ b/programmer/swebench/score.py
@@ -0,0 +1,58 @@
+from typing import Any
+from swebench.harness.test_spec import make_test_spec
+from swebench.harness.log_parsers import MAP_REPO_TO_PARSER
+from swebench.harness.grading import get_eval_tests_report, get_resolution_status
+from swebench.harness.constants import (
+ FAIL_TO_PASS,
+ KEY_INSTANCE_ID,
+ PASS_TO_PASS,
+ ResolvedStatus,
+ SWEbenchInstance,
+)
+
+from ..tools import RemoteContainerToolContext
+
+
+def score_swebench(instance: SWEbenchInstance, model_output):
+ patch = model_output["answer"]
+ tc = RemoteContainerToolContext(
+ "http://localhost:8000",
+ "/testbed",
+ "source /opt/miniconda3/bin/activate && conda activate testbed && ",
+ )
+
+ result: dict[str, Any] = {"patch_successfully_applied": False, "resolved": False}
+
+ ts = make_test_spec(instance)
+ container_id = f"sweb.eval.x86_64.{ts.instance_id}"
+ with tc.context(container_id):
+ print("EVAL SCRIPT\n", ts.eval_script)
+
+ tc.write_file("/tmp/patch.diff", patch)
+ patch_result = tc.run_command("git apply -v /tmp/patch.diff")
+ if patch_result["exit_code"] == 0:
+ result["patch_successfully_applied"] = True
+ print("PATCH RESULT\n", patch_result)
+
+ tc.write_file("/eval.sh", ts.eval_script)
+ test_command_results = tc.run_command("chmod +x /eval.sh && /eval.sh")
+ tc_output = test_command_results["output"]
+
+ repo = "-".join(
+ ts.instance_id.replace("__", "/").split("-")[:-1]
+ ) # e.g. scikit-learn/scikit-learn
+ log_parser = MAP_REPO_TO_PARSER[repo]
+ test_name_to_passfail = log_parser(tc_output)
+
+ eval_ref = {
+ KEY_INSTANCE_ID: ts.instance_id,
+ FAIL_TO_PASS: ts.FAIL_TO_PASS,
+ PASS_TO_PASS: ts.PASS_TO_PASS,
+ }
+
+ report = get_eval_tests_report(test_name_to_passfail, eval_ref)
+ resolved = get_resolution_status(report) == ResolvedStatus.FULL.value
+
+ result.update({"resolved": resolved, "tests_status": report})
+
+ return result
diff --git a/programmer/swebench/scripts/example_v_models.py b/programmer/swebench/scripts/example_v_models.py
new file mode 100644
index 0000000..199b07d
--- /dev/null
+++ b/programmer/swebench/scripts/example_v_models.py
@@ -0,0 +1,56 @@
+# using existing swe-bench results logged to weave (see ingest dir),
+# produce a table with instance_id as rows, and models as columns.
+# useful for finding easy / hard examples
+
+import sys
+import pandas as pd
+
+import weave
+
+from ...weave_next.weave_query import calls
+
+
+def main():
+ if len(sys.argv) > 1:
+ wc = weave.init("weavedev-swebench5")
+ c = calls(wc, "Evaluation.predict_and_score", expand_refs=["inputs.example"])
+ df = c.to_pandas()
+
+ df.to_parquet("verified.parquet", engine="pyarrow")
+ else:
+ df = pd.read_parquet("verified.parquet")
+ # Pivot the dataframe
+ pivot_df = df.pivot(
+ index="inputs.example.instance_id",
+ columns="inputs.model",
+ values="output.model_output.resolved",
+ )
+
+ # Extract model names from the column names
+ pivot_df.columns = pivot_df.columns.str.extract(r"object/(.+):")[0]
+
+ # Count models with resolved True for each instance
+ pivot_df["models_resolved_true"] = pivot_df.apply(lambda row: row.sum(), axis=1)
+
+ # Move the count column to the leftmost position
+ cols = pivot_df.columns.tolist()
+ cols = cols[-1:] + cols[:-1]
+ pivot_df = pivot_df[cols]
+
+ # Sort the pivot table by 'models_resolved_true' in descending order
+ pivot_df = pivot_df.sort_values(by="models_resolved_true", ascending=False) # type: ignore
+
+ # Sort columns by the model that got the most resolved
+ model_success_count = pivot_df.sum().sort_values(ascending=False)
+ sorted_columns = ["models_resolved_true"] + model_success_count.index.tolist()
+ pivot_df = pivot_df[sorted_columns]
+
+ # Display the first few rows of the resulting table
+ print(pivot_df.head())
+
+ # Optionally, save the pivot table to a new file
+ pivot_df.to_csv("pivot_table.csv")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/programmer/swe-bench/swebench-difficulties.py b/programmer/swebench/scripts/verified_difficulty_labels.py
similarity index 97%
rename from programmer/swe-bench/swebench-difficulties.py
rename to programmer/swebench/scripts/verified_difficulty_labels.py
index 6a86e57..9c3068b 100644
--- a/programmer/swe-bench/swebench-difficulties.py
+++ b/programmer/swebench/scripts/verified_difficulty_labels.py
@@ -1,5 +1,6 @@
# Quick script for viewing swebench examples against
# annotated difficulties.
+# TODO: update for new file paths (in ../data)
import pandas as pd
import textwrap
diff --git a/programmer/swebench/swebench_model.py b/programmer/swebench/swebench_model.py
new file mode 100644
index 0000000..53805d6
--- /dev/null
+++ b/programmer/swebench/swebench_model.py
@@ -0,0 +1,41 @@
+import weave
+
+from ..agent import Agent, AgentState, TimeLimitExceeded
+from ..tools import RemoteContainerToolContext
+
+
+class SWEBenchProgrammerModel(weave.Model):
+ agent: Agent
+ max_runtime_seconds: int = 60
+
+ def predict(self, instance):
+ instance_id = instance["instance_id"]
+ problem_statement = instance["problem_statement"]
+ initial_prompt = f"""You are in a checkout of a git repo. Please identify and fix the issue described in the problem statement.
+
+
+{problem_statement}
+"""
+ state = AgentState(
+ history=[
+ {
+ "role": "user",
+ "content": initial_prompt,
+ },
+ ],
+ )
+
+ tc = RemoteContainerToolContext(
+ "http://localhost:8000",
+ "/testbed",
+ "source /opt/miniconda3/bin/activate && conda activate testbed && ",
+ )
+ container_id = f"sweb.eval.x86_64.{instance_id}"
+ with tc.context(container_id):
+ try:
+ self.agent.run(state, max_runtime_seconds=self.max_runtime_seconds)
+ except TimeLimitExceeded:
+ return {"errorcode": "runtime", "answer": ""}
+ answer_result = tc.run_command("git diff")
+ answer = answer_result["output"]
+ return {"answer": answer}
diff --git a/programmer/tests/test_file_line_tools.py b/programmer/tests/test_file_line_tools.py
index 16cec22..eec5f8e 100644
--- a/programmer/tests/test_file_line_tools.py
+++ b/programmer/tests/test_file_line_tools.py
@@ -1,22 +1,20 @@
import os
import pytest
from tempfile import TemporaryDirectory
-from programmer.tools import read_lines_from_file, replace_lines_in_file
+from programmer.tools import (
+    read_lines_from_file,
+    replace_lines_in_file,
+    LocalToolContext,
+    tool_context,
+    get_current_context,
+)
@pytest.fixture()
-def temp_dir():
+def tempdir_tool_context():
with TemporaryDirectory() as tmpdir:
- yield tmpdir
+ with tool_context(LocalToolContext(tmpdir)) as tc:
+ yield tc
@pytest.fixture()
-def test_file_path(temp_dir):
- file_path = os.path.join(temp_dir, "test_file.txt")
- with open(file_path, "w") as f:
- f.write(
- "Line 1\nLine 2\nLine 3\nLine 4\nLine 5\nLine 6\nLine 7\nLine 8\nLine 9\nLine 10\n"
- )
+def test_file_path(tempdir_tool_context):
+ file_path = "test_file.txt"
+ tempdir_tool_context.write_file(file_path, "Line 1\nLine 2\nLine 3\nLine 4\nLine 5\nLine 6\nLine 7\nLine 8\nLine 9\nLine 10\n")
yield file_path
@@ -39,7 +37,7 @@ def test_read_lines_from_file(test_file_path):
read_lines_from_file(test_file_path, 11)
-def test_replace_lines_in_file(temp_dir, test_file_path):
+def test_replace_lines_in_file(test_file_path):
# Valid replacement
result = replace_lines_in_file(
test_file_path,
@@ -54,8 +52,7 @@ def test_replace_lines_in_file(temp_dir, test_file_path):
assert "10:Line 10\n" in result
# Replacement with a new file
- new_file_path = os.path.join(temp_dir, "new_test_file.txt")
- result = replace_lines_in_file(new_file_path, 1, 0, "", "First Line\nSecond Line\n")
+ result = replace_lines_in_file("new_test_file.txt", 1, 0, "", "First Line\nSecond Line\n")
assert "1:First Line\n" in result
assert "2:Second Line\n" in result
@@ -63,18 +60,16 @@ def test_replace_lines_in_file(temp_dir, test_file_path):
# Test appending to the end of a file
-def test_append_to_file(temp_dir, test_file_path):
+def test_append_to_file(tempdir_tool_context, test_file_path):
# Read the original content
- with open(test_file_path, "r") as f:
- original_content = f.read()
+ original_content = tempdir_tool_context.read_file(test_file_path)
# Append new lines
new_lines = "New Line 11\nNew Line 12\n"
result = replace_lines_in_file(test_file_path, 11, 0, "", new_lines)
# Verify the file content
- with open(test_file_path, "r") as f:
- updated_content = f.read()
+ updated_content = tempdir_tool_context.read_file(test_file_path)
assert updated_content == original_content + new_lines
@@ -90,10 +85,9 @@ def test_append_to_file(temp_dir, test_file_path):
# Test inserting at the beginning of an existing file
-def test_insert_at_beginning(test_file_path):
+def test_insert_at_beginning(tempdir_tool_context, test_file_path):
# Read the original content
- with open(test_file_path, "r") as f:
- original_content = f.read()
+ original_content = tempdir_tool_context.read_file(test_file_path)
# Insert new lines at the beginning
new_lines = "New First Line\nNew Second Line\n"
@@ -105,8 +99,7 @@ def test_insert_at_beginning(test_file_path):
assert "3:Line 1\n" in result
# Verify the file content
- with open(test_file_path, "r") as f:
- updated_content = f.read()
+ updated_content = tempdir_tool_context.read_file(test_file_path)
assert updated_content == new_lines + original_content
diff --git a/programmer/tools.py b/programmer/tools.py
index 8c685c6..c5e7766 100644
--- a/programmer/tools.py
+++ b/programmer/tools.py
@@ -4,28 +4,160 @@
import subprocess
import weave
import contextlib
+import shlex
from contextvars import ContextVar
+from contextlib import contextmanager
+from typing import Protocol, Union, TypedDict, Optional
+import requests
LENGTH_LIMIT = 30000
+# TODO:
+# - get rid of resolve_path
+# - must return FileNotFoundError in read_file in Remote
-class ToolContext:
+
+class RunCommandResult(TypedDict):
+ exit_code: int
+ output: str
+
+
+class ToolContext(Protocol):
+ def write_file(self, path: str, content: str) -> None: ...
+
+ def read_file(self, path: str) -> str: ...
+
+ def run_command(self, command: str) -> RunCommandResult: ...
+
+ def resolve_path(self, path: str) -> str: ...
+
+
+class LocalToolContext(ToolContext):
def __init__(self, directory):
self.directory = os.path.abspath(directory)
- def resolve_path(self, path):
+ def write_file(self, path: str, content: str) -> None:
+ full_path = self.resolve_path(path)
+ with open(full_path, "w") as f:
+ f.write(content)
+
+ def read_file(self, path: str) -> str:
+ full_path = self.resolve_path(path)
+ with open(full_path, "r") as f:
+ return f.read()
+
+ def run_command(self, command: str) -> RunCommandResult:
+ completed_process = subprocess.run(
+ command,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.STDOUT,
+ text=True,
+ shell=True,
+ cwd=self.directory,
+ )
+ exit_code = completed_process.returncode
+ output = completed_process.stdout.strip()
+
+ return {
+ "exit_code": exit_code,
+ "output": output,
+ }
+
+ def resolve_path(self, path: str) -> str:
return os.path.join(self.directory, path)
+class RemoteContainerToolContext(ToolContext):
+ def __init__(self, base_url: str, directory: str, command_prefix: str):
+ self.base_url = base_url
+ self.container_id = None
+ self.directory = directory
+ self.command_prefix = command_prefix
+
+ @contextmanager
+ def context(self, image_id: str):
+ self.start_container(image_id)
+ try:
+ with tool_context(self):
+ yield
+ finally:
+ self.stop_container()
+
+ def start_container(self, image_id):
+ response = requests.post(
+ f"{self.base_url}/container/start", json={"image_id": image_id}
+ )
+ if response.status_code == 200:
+ self.container_id = response.json().get("container_id")
+ else:
+ print(f"Failed to start container: {response.text}")
+
+ def stop_container(self):
+ response = requests.post(
+ f"{self.base_url}/container/stop",
+ json={"container_id": self.container_id, "delete": True},
+ )
+ if response.status_code == 200:
+ self.container_id = None
+ else:
+ print(f"Failed to stop container: {response.text}")
+
+ def write_file(self, path: str, content: str) -> None:
+ full_path = os.path.join(self.directory, path)
+ response = requests.post(
+ f"{self.base_url}/container/write_file",
+ json={
+ "container_id": self.container_id,
+ "file_path": full_path,
+ "file_content": content,
+ },
+ )
+ if response.status_code != 200:
+ raise Exception(f"Failed to write file: {response.text}")
+
+ def read_file(self, path: str) -> str:
+ full_path = os.path.join(self.directory, path)
+ response = requests.post(
+ f"{self.base_url}/container/read_file",
+ json={"container_id": self.container_id, "file_path": full_path},
+ )
+ if response.status_code == 200:
+ return response.json().get("file_content")
+ else:
+ raise Exception(f"Failed to read file: {response.text}")
+
+ def run_command(self, command: str) -> RunCommandResult:
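+ # Prepend the context's command prefix (e.g. conda activation) and quote
+ # the result so it runs as a single bash -c invocation in the container.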
+ command = self.command_prefix + command
+ command = f"bash -c {shlex.quote(command)}"
+ response = requests.post(
+ f"{self.base_url}/container/run",
+ json={
+ "container_id": self.container_id,
+ "workdir": self.directory,
+ "command": command,
+ },
+ )
+ if response.status_code == 200:
+ json = response.json()
+ return {
+ "exit_code": json["exit_code"],
+ "output": json["output"],
+ }
+ else:
+ raise Exception(f"Failed to run command: {response.text}")
+
+ def resolve_path(self, path: str) -> str:
+ return path # For remote containers, we assume paths are already resolved
+
+
# Create a ContextVar to store the current ToolContext
-current_context: ContextVar[ToolContext | None] = ContextVar(
- "current_context", default=None
-)
+current_context: ContextVar[
+ Optional[Union[LocalToolContext, RemoteContainerToolContext]]
+] = ContextVar("current_context", default=None)
@contextlib.contextmanager
-def tool_context(directory):
- context = ToolContext(directory)
+def tool_context(context: Union[LocalToolContext, RemoteContainerToolContext]):
token = current_context.set(context)
try:
yield context
@@ -33,10 +165,10 @@ def tool_context(directory):
current_context.reset(token)
-def get_current_context():
+def get_current_context() -> Union[LocalToolContext, RemoteContainerToolContext]:
context = current_context.get()
if context is None:
- return ToolContext(".")
+ return LocalToolContext(".")
return context
@@ -95,12 +227,16 @@ def list_files(directory: str) -> str:
The list of files in the directory.
"""
context = get_current_context()
- full_path = context.resolve_path(directory)
- result = json.dumps(os.listdir(full_path))
- if len(result) > LENGTH_LIMIT:
- result = result[:LENGTH_LIMIT]
- result += "\n... (truncated)"
- return result
+ # Use run_command so listing works for both local and remote contexts.
+ result = context.run_command(f"ls {directory}")
+ exit_code = result["exit_code"]
+ output = result["output"]
+ if exit_code != 0:
+ raise Exception(f"Failed to list files: {output}")
+ if len(output) > LENGTH_LIMIT:
+ output = output[:LENGTH_LIMIT]
+ output += "\n... (truncated)"
+ return output
@weave.op()
@@ -115,13 +251,14 @@ def write_to_file(path: str, content: str) -> str:
A message indicating whether the file was written successfully.
"""
context = get_current_context()
- full_path = context.resolve_path(path)
- with open(full_path, "w") as f:
- f.write(content)
+ if len(content) > LENGTH_LIMIT:
+ content = content[:LENGTH_LIMIT]
+ content += "\n... (truncated)"
+ context.write_file(path, content)
return "File written successfully."
-@weave.op()
+@weave.op
def read_from_file(path: str) -> str:
"""Read text from a file at the given path.
@@ -132,13 +269,11 @@ def read_from_file(path: str) -> str:
The content of the file.
"""
context = get_current_context()
- full_path = context.resolve_path(path)
- with open(full_path, "r") as f:
- result = f.read()
- if len(result) > LENGTH_LIMIT:
- result = result[:LENGTH_LIMIT]
- result += "\n... (truncated)"
- return result
+ result = context.read_file(path)
+ if len(result) > LENGTH_LIMIT:
+ result = result[:LENGTH_LIMIT]
+ result += "\n... (truncated)"
+ return result
@weave.op()
@@ -152,30 +287,18 @@ def run_command(command: str) -> str:
The output of the command.
"""
context = get_current_context()
- completed_process = subprocess.run(
- command,
- stdout=subprocess.PIPE,
- stderr=subprocess.PIPE,
- text=True,
- shell=True,
- cwd=context.directory, # Set the working directory for the command
- )
- exit_code = completed_process.returncode
- stdout = completed_process.stdout.strip()
- stderr = completed_process.stderr.strip()
-
- if len(stdout) > LENGTH_LIMIT:
- stdout = stdout[:LENGTH_LIMIT]
- stdout += "\n... (truncated)"
- if len(stderr) > LENGTH_LIMIT:
- stderr = stderr[:LENGTH_LIMIT]
- stderr += "\n... (truncated)"
+ result = context.run_command(command)
+
+ exit_code = result["exit_code"]
+ output = result["output"]
+
+ if len(output) > LENGTH_LIMIT:
+ output = output[:LENGTH_LIMIT]
+ output += "\n... (truncated)"
result = f"Exit code: {exit_code}\n"
- if stderr:
- result += f"STDERR\n{stderr}\n"
- if stdout:
- result += f"STDOUT\n{stdout}\n"
+ if output:
+ result += f"OUTPUT\n{output}\n"
return result
@@ -195,11 +318,8 @@ def read_lines_from_file(file_path: str, start_line: int) -> str:
"""
context = get_current_context()
full_path = context.resolve_path(file_path)
- if not os.path.exists(full_path):
- raise Exception(f"File '{full_path}' does not exist.")
-
- with open(full_path, "r") as file:
- lines = file.readlines()
+ content = context.read_file(full_path)
+ lines = content.splitlines()
if start_line < 1 or start_line > len(lines):
raise Exception("Invalid start_line number.")
@@ -208,7 +328,7 @@ def read_lines_from_file(file_path: str, start_line: int) -> str:
result = ""
for i in range(start_line - 1, end_line - 1):
- result += f"{i + 1}:{lines[i]}"
+ result += f"{i + 1}:{lines[i]}\n"
return result
@@ -238,44 +358,39 @@ def replace_lines_in_file(
"""
context = get_current_context()
full_path = context.resolve_path(file_path)
- lines = []
- if os.path.exists(full_path):
- with open(full_path, "r") as file:
- lines = file.readlines()
+ try:
+ content = context.read_file(full_path)
+ except FileNotFoundError:
+ content = ""
+ lines = content.splitlines()
end_line = start_line + remove_line_count
if start_line < 1 or end_line < start_line or start_line > len(lines) + 1:
raise Exception("Invalid line range.")
- prev_line_split = [l + "\n" for l in previous_lines.splitlines()]
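+ # splitlines() drops trailing newlines: lines are compared and edited
+ # newline-free, then re-joined with "\n" when written back.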
+ prev_line_split = previous_lines.splitlines()
if not lines[start_line - 1 : end_line - 1] == prev_line_split:
raise Exception("Previous lines do not match.")
# Adjust end_line if it exceeds the current number of lines
end_line = min(end_line, len(lines) + 1)
- if not new_lines.endswith("\n"):
- new_lines += "\n"
-
# Convert new_lines string into a list of lines
- new_lines_list = new_lines.splitlines(keepends=True)
+ new_lines_list = new_lines.splitlines()
# Replace the specified line range
lines[start_line - 1 : end_line - 1] = new_lines_list
# Write the modified lines back to the file
- with open(full_path, "w") as file:
- file.writelines(lines)
+ context.write_file(full_path, "\n".join(lines) + "\n")
# Determine the range for the output with a 5-line buffer
output_start = max(start_line - 6, 0)
- output_end = min(
- start_line - 1 + len(new_lines_list) + 6, len(lines)
- ) # Calculate buffer correctly
+ output_end = min(start_line - 1 + len(new_lines_list) + 6, len(lines))
result = ""
for i in range(output_start, output_end):
- result += f"{i + 1}:{lines[i]}"
+ result += f"{i + 1}:{lines[i]}\n"
return result
diff --git a/pyproject.toml b/pyproject.toml
index ddb29a0..1fe20b5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -14,7 +14,7 @@ license = { text = "Apache-2.0" }
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
- "weave", "streamlit", "pandas", "litellm"
+ "weave==0.50.15", "streamlit", "pandas", "litellm"
]
[tool.setuptools]
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 55b033e..ab309e2 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1 +1,7 @@
-pytest
\ No newline at end of file
+setuptools
+wheel
+pytest
+pyright
+fastapi
+docker
+swebench
\ No newline at end of file