Skip to content

Commit f47cff1

Browse files
authored
Update SWE-bench instructions, lint (#20)
1 parent 88467a2 commit f47cff1

File tree

4 files changed

+18
-25
lines changed

4 files changed

+18
-25
lines changed

programmer/evaluate.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import weave
44

55
from agent import AgentState
6-
from config import agent
6+
from config import agent_4o_basic
77

88
# Need to serialize AgentState as json for now since we can't save weave Objects
99
# in Dataset set.
@@ -40,7 +40,7 @@ def final_answer_substr(expected_substr: str, model_output: str):
4040

4141
@weave.op()
4242
def model_agent_bridge(state: str):
43-
return agent.run(AgentState(**json.loads(state))).model_dump_json()
43+
return agent_4o_basic.run(AgentState(**json.loads(state))).model_dump_json()
4444

4545

4646
if __name__ == "__main__":

programmer/swebench/README.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,12 @@
11
# SWE Bench programmer evaluation
22

3+
This is a custom setup to run fast SWE-bench evals on programmer. The steps are:
4+
- serve swebench docker containers from a remote machine
5+
- set up an x86 machine (I use a GCP e2-standard-32)
6+
- build the swebench instance images. For SWE-bench_Verified this builds about 550 images.
7+
- run [containerserver](../containerserver/README.md) on the machine. containerserver serves an HTTP interface into the Docker containers.
8+
- on your local machine, run `python -m programmer.swebench.run_instance` or `python -m programmer.swebench.evaluate`
9+
310
## Build SWE-bench images
411

512
First do the setup (below), then run this command to build all the images. `--cache_level instance` tells the script not to delete the instance images, which are what we want to use with container-manager.
@@ -13,6 +20,10 @@ python -m swebench.harness.run_evaluation \
1320
--cache_level instance
1421
```
1522

23+
## Run containerserver
24+
25+
See [containerserver](../containerserver/README.md) for instructions on setting up and running containerserver.
26+
1627

1728
## Remote machine setup instructions on GCP VM Ubuntu 20.04
1829

programmer/swebench/evaluate.py

Lines changed: 3 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,7 @@
77
from .swebench_model import SWEBenchProgrammerModel
88
from .score import score_swebench
99
from ..agent import Agent
10-
from ..config import SYSTEM_MESSAGE
11-
from ..tools import (
12-
list_files,
13-
run_command,
14-
view_image,
15-
read_lines_from_file,
16-
replace_lines_in_file,
17-
)
10+
from ..config import agent_4o_basic
1811

1912

2013
def load_raw_dataset(name: str, split: str):
@@ -62,22 +55,11 @@ def main():
6255
# ds = load_weave_dataset("SWE-bench_Verified", "test", instance_ids=instance_ids)
6356
ds = load_weave_dataset("SWE-bench_Verified", "test", limit=50, shuffle_seed=42)
6457
eval = weave.Evaluation(
65-
name="SWE-bench_Verified", dataset=ds, scorers=[score_swebench], trials=5
58+
name="SWE-bench_Verified", dataset=ds, scorers=[score_swebench], trials=1
6659
)
6760

6861
model = SWEBenchProgrammerModel(
69-
agent=Agent(
70-
model_name="gpt-4o-2024-08-06",
71-
temperature=0.7,
72-
system_message=SYSTEM_MESSAGE,
73-
tools=[
74-
list_files,
75-
run_command,
76-
view_image,
77-
read_lines_from_file,
78-
replace_lines_in_file,
79-
],
80-
),
62+
agent=agent_4o_basic,
8163
max_runtime_seconds=180,
8264
)
8365
res = asyncio.run(eval.evaluate(model))

programmer/swebench/run_instance.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212
from ..swebench.swebench_model import SWEBenchProgrammerModel
1313
from ..swebench.score import score_swebench
14-
from ..config import agent_replace
14+
from ..config import agent_4o_basic
1515

1616

1717
def main():
@@ -42,7 +42,7 @@ def main():
4242
print("SOLUTION\n", instance["patch"])
4343
print()
4444

45-
model = SWEBenchProgrammerModel(agent=agent_replace)
45+
model = SWEBenchProgrammerModel(agent=agent_4o_basic)
4646
model_output = model.predict(instance)
4747
score = score_swebench(instance, model_output["answer"])
4848
print("SCORE\n", score)

0 commit comments

Comments
 (0)