-
Notifications
You must be signed in to change notification settings - Fork 1k
/
Copy pathcomputer_use.py
165 lines (134 loc) · 5.02 KB
/
computer_use.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
import asyncio
import base64
import logging
from typing import Literal, Union
from playwright.async_api import Browser, Page, Playwright, async_playwright
from agents import (
Agent,
AsyncComputer,
Button,
ComputerTool,
Environment,
ModelSettings,
Runner,
trace,
)
logging.getLogger("openai.agents").setLevel(logging.DEBUG)
logging.getLogger("openai.agents").addHandler(logging.StreamHandler())
async def main():
async with LocalPlaywrightComputer() as computer:
with trace("Computer use example"):
agent = Agent(
name="Browser user",
instructions="You are a helpful agent.",
tools=[ComputerTool(computer)],
# Use the computer using model, and set truncation to auto because its required
model="computer-use-preview",
model_settings=ModelSettings(truncation="auto"),
)
result = await Runner.run(agent, "Search for SF sports news and summarize.")
print(result.final_output)
CUA_KEY_TO_PLAYWRIGHT_KEY = {
"/": "Divide",
"\\": "Backslash",
"alt": "Alt",
"arrowdown": "ArrowDown",
"arrowleft": "ArrowLeft",
"arrowright": "ArrowRight",
"arrowup": "ArrowUp",
"backspace": "Backspace",
"capslock": "CapsLock",
"cmd": "Meta",
"ctrl": "Control",
"delete": "Delete",
"end": "End",
"enter": "Enter",
"esc": "Escape",
"home": "Home",
"insert": "Insert",
"option": "Alt",
"pagedown": "PageDown",
"pageup": "PageUp",
"shift": "Shift",
"space": " ",
"super": "Meta",
"tab": "Tab",
"win": "Meta",
}
class LocalPlaywrightComputer(AsyncComputer):
"""A computer, implemented using a local Playwright browser."""
def __init__(self):
self._playwright: Union[Playwright, None] = None
self._browser: Union[Browser, None] = None
self._page: Union[Page, None] = None
async def _get_browser_and_page(self) -> tuple[Browser, Page]:
width, height = self.dimensions
launch_args = [f"--window-size={width},{height}"]
browser = await self.playwright.chromium.launch(headless=False, args=launch_args)
page = await browser.new_page()
await page.set_viewport_size({"width": width, "height": height})
await page.goto("https://www.bing.com")
return browser, page
async def __aenter__(self):
# Start Playwright and call the subclass hook for getting browser/page
self._playwright = await async_playwright().start()
self._browser, self._page = await self._get_browser_and_page()
return self
async def __aexit__(self, exc_type, exc_val, exc_tb):
if self._browser:
await self._browser.close()
if self._playwright:
await self._playwright.stop()
@property
def playwright(self) -> Playwright:
assert self._playwright is not None
return self._playwright
@property
def browser(self) -> Browser:
assert self._browser is not None
return self._browser
@property
def page(self) -> Page:
assert self._page is not None
return self._page
@property
def environment(self) -> Environment:
return "browser"
@property
def dimensions(self) -> tuple[int, int]:
return (1024, 768)
async def screenshot(self) -> str:
"""Capture only the viewport (not full_page)."""
png_bytes = await self.page.screenshot(full_page=False)
return base64.b64encode(png_bytes).decode("utf-8")
async def click(self, x: int, y: int, button: Button = "left") -> None:
playwright_button: Literal["left", "middle", "right"] = "left"
# Playwright only supports left, middle, right buttons
if button in ("left", "right", "middle"):
playwright_button = button # type: ignore
await self.page.mouse.click(x, y, button=playwright_button)
async def double_click(self, x: int, y: int) -> None:
await self.page.mouse.dblclick(x, y)
async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
await self.page.mouse.move(x, y)
await self.page.evaluate(f"window.scrollBy({scroll_x}, {scroll_y})")
async def type(self, text: str) -> None:
await self.page.keyboard.type(text)
async def wait(self) -> None:
await asyncio.sleep(1)
async def move(self, x: int, y: int) -> None:
await self.page.mouse.move(x, y)
async def keypress(self, keys: list[str]) -> None:
for key in keys:
mapped_key = CUA_KEY_TO_PLAYWRIGHT_KEY.get(key.lower(), key)
await self.page.keyboard.press(mapped_key)
async def drag(self, path: list[tuple[int, int]]) -> None:
if not path:
return
await self.page.mouse.move(path[0][0], path[0][1])
await self.page.mouse.down()
for px, py in path[1:]:
await self.page.mouse.move(px, py)
await self.page.mouse.up()
if __name__ == "__main__":
asyncio.run(main())