Skip to content

Commit 51d9993

Browse files
authored
Merge pull request #57 from klxu03/reflective-mouse-click
Added -accurate, reflective mouse click mode
2 parents 1b70027 + 12997a3 commit 51d9993

File tree

3 files changed

+168
-10
lines changed

3 files changed

+168
-10
lines changed

.gitignore

+4-1
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,10 @@ cython_debug/
165165
screenshot.png
166166
screenshot_with_grid.png
167167
screenshot_with_labeled_grid.png
168+
screenshot_mini.png
169+
screenshot_mini_with_grid.png
168170
grid_screenshot.png
169171
grid_reflection_screenshot.png
170172
reflection_screenshot.png
171-
summary_screenshot.png
173+
summary_screenshot.png
174+
operate/screenshots/

README.md

+7
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,15 @@ git clone https://github.com/OthersideAI/self-operating-computer.git
4646
cd self-operating-computer
4747
```
4848

49+
3. **Create a Python virtual environment with Poetry**.
50+
51+
```
52+
cat requirements.txt | xargs poetry add
53+
```
54+
4955
4. **Create a Python virtual environment**. [Learn more about Python virtual environment](https://docs.python.org/3/library/venv.html).
5056

57+
5158
```
5259
python3 -m venv venv
5360
```

operate/main.py

+157-9
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
import argparse
1313
import platform
1414
import Xlib.display
15+
import Xlib.X
16+
import Xlib.Xutil # not sure if Xutil is necessary
1517

1618
from prompt_toolkit import prompt
1719
from prompt_toolkit.shortcuts import message_dialog
@@ -31,6 +33,11 @@
3133
client.api_key = os.getenv("OPENAI_API_KEY")
3234
client.base_url = os.getenv("OPENAI_API_BASE_URL", client.base_url)
3335

36+
monitor_size = {
37+
"width": 1920,
38+
"height": 1080,
39+
}
40+
3441
VISION_PROMPT = """
3542
You are a Self-Operating Computer. You use the same operating system as a human.
3643
@@ -46,7 +53,8 @@
4653
Here are the response formats below.
4754
4855
1. CLICK
49-
Response: CLICK {{ "x": "percent", "y": "percent", "description": "~description here~", "reason": "~reason here~" }}
56+
Response: CLICK {{ "x": "percent", "y": "percent", "description": "~description here~", "reason": "~reason here~" }}
57+
Note that the percents work where the top left corner is "x": "0%" and "y": "0%" and the bottom right corner is "x": "100%" and "y": "100%"
5058
5159
2. TYPE
5260
Response: TYPE "value you want to type"
@@ -88,6 +96,19 @@
8896
Objective: {objective}
8997
"""
9098

99+
ACCURATE_PIXEL_COUNT = 200 # mini_screenshot is ACCURATE_PIXEL_COUNT x ACCURATE_PIXEL_COUNT big
100+
ACCURATE_MODE_VISION_PROMPT = """
101+
It looks like your previous attempted action was clicking on "x": {prev_x}, "y": {prev_y}. This has now been moved to the center of this screenshot.
102+
As additional context to the previous message, before you decide the proper percentage to click on, please closely examine this additional screenshot as additional context for your next action.
103+
This screenshot was taken around the location of the current cursor that you just tried clicking on ("x": {prev_x}, "y": {prev_y} is now at the center of this screenshot). You should use this as a differential to your previous x y coordinate guess.
104+
105+
If you want to refine and instead click on the top left corner of this mini screenshot, you will subtract {width}% in the "x" and subtract {height}% in the "y" to your previous answer.
106+
Likewise, to achieve the bottom right of this mini screenshot you will add {width}% in the "x" and add {height}% in the "y" to your previous answer.
107+
108+
There are four segmenting lines across each dimension, divided evenly. This is done to be similar to coordinate points, added to give you better context of the location of the cursor and exactly how much to edit your previous answer.
109+
110+
Please use this context as additional info to further refine the "percent" location in the CLICK action!
111+
"""
91112

92113
USER_QUESTION = "Hello, I can help you with anything. What would you like done?"
93114

@@ -171,7 +192,7 @@ def supports_ansi():
171192
ANSI_BRIGHT_MAGENTA = ""
172193

173194

174-
def main(model):
195+
def main(model, accurate_mode):
175196
"""
176197
Main function for the Self-Operating Computer
177198
"""
@@ -209,7 +230,7 @@ def main(model):
209230
if DEBUG:
210231
print("[loop] messages before next action:\n\n\n", messages[1:])
211232
try:
212-
response = get_next_action(model, messages, objective)
233+
response = get_next_action(model, messages, objective, accurate_mode)
213234
action = parse_oai_response(response)
214235
action_type = action.get("type")
215236
action_detail = action.get("data")
@@ -291,9 +312,19 @@ def format_vision_prompt(objective, previous_action):
291312
return prompt
292313

293314

294-
def get_next_action(model, messages, objective):
315+
def format_accurate_mode_vision_prompt(prev_x, prev_y):
    """Build the accurate-mode reprompt sent alongside the mini screenshot.

    The ``width``/``height`` placeholders are half the mini-screenshot size
    (ACCURATE_PIXEL_COUNT / 2) expressed as a percentage of the full monitor,
    i.e. how far the mini screenshot's edges lie from its centre in screen
    percent — the delta the model may apply to its previous click guess.
    """
    half_pixels = ACCURATE_PIXEL_COUNT / 2
    pct_width = 100 * (half_pixels / monitor_size["width"])
    pct_height = 100 * (half_pixels / monitor_size["height"])
    return ACCURATE_MODE_VISION_PROMPT.format(
        prev_x=prev_x,
        prev_y=prev_y,
        width=pct_width,
        height=pct_height,
    )
323+
324+
325+
def get_next_action(model, messages, objective, accurate_mode):
295326
if model == "gpt-4-vision-preview":
296-
content = get_next_action_from_openai(messages, objective)
327+
content = get_next_action_from_openai(messages, objective, accurate_mode)
297328
return content
298329
elif model == "agent-1":
299330
return "coming soon"
@@ -314,8 +345,56 @@ def get_last_assistant_message(messages):
314345
return messages[index]
315346
return None # Return None if no assistant message is found
316347

348+
def accurate_mode_double_check(pseudo_messages, prev_x, prev_y):
    """Refine a CLICK guess by reprompting the model with a zoomed-in view.

    A mini screenshot centred on the previous (prev_x, prev_y) click target
    is captured, a grid-annotated copy is read back, and both the explanatory
    prompt and the image are appended to ``pseudo_messages`` before asking
    the vision model for a refined answer.

    Returns the model's reply content, or the sentinel string "ERROR" on any
    failure (the caller asserts against this sentinel).
    """
    try:
        mini_path = os.path.join(
            "screenshots", "screenshot_mini.png"
        )
        capture_mini_screenshot_with_cursor(
            file_path=mini_path, x=prev_x, y=prev_y
        )

        # The grid-annotated copy is produced as a side effect of the capture.
        grid_path = os.path.join(
            "screenshots", "screenshot_mini_with_grid.png"
        )
        with open(grid_path, "rb") as img_file:
            img_base64 = base64.b64encode(img_file.read()).decode("utf-8")

        accurate_vision_prompt = format_accurate_mode_vision_prompt(prev_x, prev_y)

        pseudo_messages.append(
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": accurate_vision_prompt},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
                    },
                ],
            }
        )

        response = client.chat.completions.create(
            model="gpt-4-vision-preview",
            messages=pseudo_messages,
            presence_penalty=1,
            frequency_penalty=1,
            temperature=0.7,
            max_tokens=300,
        )

        return response.choices[0].message.content
    except Exception as e:
        print(f"Error reprompting model for accurate_mode: {e}")
        return "ERROR"
395+
396+
397+
def get_next_action_from_openai(messages, objective, accurate_mode):
319398
"""
320399
Get the next action for Self-Operating Computer
321400
"""
@@ -355,6 +434,7 @@ def get_next_action_from_openai(messages, objective):
355434
},
356435
],
357436
}
437+
358438
# create a copy of messages and save to pseudo_messages
359439
pseudo_messages = messages.copy()
360440
pseudo_messages.append(vision_message)
@@ -374,7 +454,23 @@ def get_next_action_from_openai(messages, objective):
374454
"content": "`screenshot.png`",
375455
}
376456
)
457+
377458
content = response.choices[0].message.content
459+
460+
if accurate_mode:
461+
if content.startswith("CLICK"):
462+
# Adjust pseudo_messages to include the accurate_mode_message
463+
464+
click_data = re.search(r"CLICK \{ (.+) \}", content).group(1)
465+
click_data_json = json.loads(f"{{{click_data}}}")
466+
prev_x = click_data_json["x"]
467+
prev_y = click_data_json["y"]
468+
469+
if DEBUG:
470+
print(f"Previous coords before accurate tuning: prev_x {prev_x} prev_y {prev_y}")
471+
content = accurate_mode_double_check(pseudo_messages, prev_x, prev_y)
472+
assert content != "ERROR", "ERROR: accurate_mode_double_check failed"
473+
378474
return content
379475

380476
except Exception as e:
@@ -445,7 +541,6 @@ def summarize(messages, objective):
445541
print(f"Error parsing JSON: {e}")
446542
return "Failed to summarize the workflow"
447543

448-
449544
def mouse_click(click_detail):
450545
try:
451546
x = convert_percent_to_decimal(click_detail["x"])
@@ -575,7 +670,51 @@ def search(text):
575670
return "Open program: " + text
576671

577672

673+
def capture_mini_screenshot_with_cursor(
    file_path=os.path.join("screenshots", "screenshot_mini.png"), x=0, y=0
):
    """Capture an ACCURATE_PIXEL_COUNT x ACCURATE_PIXEL_COUNT screenshot
    centred on (x, y) and write a grid-annotated copy next to it.

    x and y are percentage strings such as "50%" (as produced by the model's
    CLICK response); they are converted to absolute pixels against the
    module-level ``monitor_size``.

    Side effects: writes ``file_path`` and
    ``screenshots/screenshot_mini_with_grid.png``.

    Only Linux and macOS are handled; on any other platform this is a silent
    no-op, matching the original behaviour.
    """
    user_platform = platform.system()
    if user_platform not in ("Linux", "Darwin"):
        return

    # Shared by both platforms: "50%" -> 50.0 -> absolute pixel coordinate.
    x_px = (float(x[:-1]) / 100) * monitor_size["width"]
    y_px = (float(y[:-1]) / 100) * monitor_size["height"]

    # Top-left corner of the capture rectangle around the cursor.
    half = ACCURATE_PIXEL_COUNT / 2
    x1, y1 = int(x_px - half), int(y_px - half)

    if user_platform == "Linux":
        x2, y2 = int(x_px + half), int(y_px + half)
        screenshot = ImageGrab.grab(bbox=(x1, y1, x2, y2))
        # Upscale 2x so the grid / percentage marks are easier to see.
        # NOTE(review): the Darwin path does not upscale — confirm whether
        # that asymmetry is intentional.
        screenshot = screenshot.resize(
            (screenshot.width * 2, screenshot.height * 2), Image.LANCZOS
        )
        screenshot.save(file_path)
    else:  # Darwin
        # Use the screencapture utility to capture the screen with the cursor
        rect = f"-R{x1},{y1},{ACCURATE_PIXEL_COUNT},{ACCURATE_PIXEL_COUNT}"
        subprocess.run(["screencapture", "-C", rect, file_path])

    screenshots_dir = "screenshots"
    grid_screenshot_filename = os.path.join(
        screenshots_dir, "screenshot_mini_with_grid.png"
    )
    add_grid_to_image(
        file_path, grid_screenshot_filename, int(ACCURATE_PIXEL_COUNT / 2)
    )
714+
715+
578716
def capture_screen_with_cursor(file_path=os.path.join("screenshots", "screenshot.png")):
717+
file_path=os.path.join("screenshots", "screenshot.png")
579718
user_platform = platform.system()
580719

581720
if user_platform == "Windows":
@@ -585,8 +724,10 @@ def capture_screen_with_cursor(file_path=os.path.join("screenshots", "screenshot
585724
# Use xlib to prevent scrot dependency for Linux
586725
screen = Xlib.display.Display().screen()
587726
size = screen.width_in_pixels, screen.height_in_pixels
727+
monitor_size["width"] = size[0]
728+
monitor_size["height"] = size[1]
588729
screenshot = ImageGrab.grab(bbox=(0, 0, size[0], size[1]))
589-
screenshot.save(file_path)
730+
screenshot.save(file_path)
590731
elif user_platform == "Darwin": # (Mac OS)
591732
# Use the screencapture utility to capture the screen with the cursor
592733
subprocess.run(["screencapture", "-C", file_path])
@@ -634,9 +775,16 @@ def main_entry():
634775
default="gpt-4-vision-preview",
635776
)
636777

778+
parser.add_argument(
779+
"-accurate",
780+
help="Activate Reflective Mouse Click Mode",
781+
action="store_true",
782+
required=False,
783+
)
784+
637785
try:
638786
args = parser.parse_args()
639-
main(args.model)
787+
main(args.model, accurate_mode=args.accurate)
640788
except KeyboardInterrupt:
641789
print(f"\n{ANSI_BRIGHT_MAGENTA}Exiting...")
642790

0 commit comments

Comments
 (0)