12
12
import argparse
13
13
import platform
14
14
import Xlib .display
15
+ import Xlib .X
16
+ import Xlib .Xutil # not sure if Xutil is necessary
15
17
16
18
from prompt_toolkit import prompt
17
19
from prompt_toolkit .shortcuts import message_dialog
31
33
client .api_key = os .getenv ("OPENAI_API_KEY" )
32
34
client .base_url = os .getenv ("OPENAI_API_BASE_URL" , client .base_url )
33
35
36
# Fallback screen resolution used to convert percentage coordinates to pixels.
# On Linux, capture_screen_with_cursor() overwrites these values with the real
# screen size reported by Xlib.
monitor_size = {
    "width": 1920,
    "height": 1080,
}
40
+
34
41
VISION_PROMPT = """
35
42
You are a Self-Operating Computer. You use the same operating system as a human.
36
43
46
53
Here are the response formats below.
47
54
48
55
1. CLICK
49
- Response: CLICK {{ "x": "percent", "y": "percent", "description": "~description here~", "reason": "~reason here~" }}
56
+ Response: CLICK {{ "x": "percent", "y": "percent", "description": "~description here~", "reason": "~reason here~" }}
57
+ Note that the percents work where the top left corner is "x": "0%" and "y": "0%" and the bottom right corner is "x": "100%" and "y": "100%"
50
58
51
59
2. TYPE
52
60
Response: TYPE "value you want to type"
88
96
Objective: {objective}
89
97
"""
90
98
99
# Side length, in pixels, of the "mini" screenshot taken around the cursor in
# accurate mode: the crop is ACCURATE_PIXEL_COUNT x ACCURATE_PIXEL_COUNT big.
ACCURATE_PIXEL_COUNT = 200

# Follow-up prompt sent in accurate mode alongside the gridded mini screenshot;
# {prev_x}/{prev_y} are the model's previous click percentages and
# {width}/{height} are the mini screenshot's half-size as a percentage of the
# monitor (see format_accurate_mode_vision_prompt).
ACCURATE_MODE_VISION_PROMPT = """
It looks like your previous attempted action was clicking on "x": {prev_x}, "y": {prev_y}. This has now been moved to the center of this screenshot.
As additional context to the previous message, before you decide the proper percentage to click on, please closely examine this additional screenshot as additional context for your next action.
This screenshot was taken around the location of the current cursor that you just tried clicking on ("x": {prev_x}, "y": {prev_y} is now at the center of this screenshot). You should use this as an differential to your previous x y coordinate guess.

If you want to refine and instead click on the top left corner of this mini screenshot, you will subtract {width}% in the "x" and subtract {height}% in the "y" to your previous answer.
Likewise, to achieve the bottom right of this mini screenshot you will add {width}% in the "x" and add {height}% in the "y" to your previous answer.

There are four segmenting lines across each dimension, divided evenly. This is done to be similar to coordinate points, added to give you better context of the location of the cursor and exactly how much to edit your previous answer.

Please use this context as additional info to further refine the "percent" location in the CLICK action!
"""
91
112
92
113
# Initial message presented to the user asking what they would like done.
USER_QUESTION = "Hello, I can help you with anything. What would you like done?"
93
114
@@ -171,7 +192,7 @@ def supports_ansi():
171
192
ANSI_BRIGHT_MAGENTA = ""
172
193
173
194
174
- def main (model ):
195
+ def main (model , accurate_mode ):
175
196
"""
176
197
Main function for the Self-Operating Computer
177
198
"""
@@ -209,7 +230,7 @@ def main(model):
209
230
if DEBUG :
210
231
print ("[loop] messages before next action:\n \n \n " , messages [1 :])
211
232
try :
212
- response = get_next_action (model , messages , objective )
233
+ response = get_next_action (model , messages , objective , accurate_mode )
213
234
action = parse_oai_response (response )
214
235
action_type = action .get ("type" )
215
236
action_detail = action .get ("data" )
@@ -291,9 +312,19 @@ def format_vision_prompt(objective, previous_action):
291
312
return prompt
292
313
293
314
294
- def get_next_action (model , messages , objective ):
315
def format_accurate_mode_vision_prompt(prev_x, prev_y):
    """Build the accurate-mode follow-up prompt for the mini screenshot.

    The width/height placeholders are the mini screenshot's half-size
    (ACCURATE_PIXEL_COUNT / 2 pixels) expressed as a percentage of the
    monitor's width and height respectively.
    """
    half_pixels = ACCURATE_PIXEL_COUNT / 2
    width_pct = half_pixels / monitor_size["width"] * 100
    height_pct = half_pixels / monitor_size["height"] * 100
    return ACCURATE_MODE_VISION_PROMPT.format(
        prev_x=prev_x,
        prev_y=prev_y,
        width=width_pct,
        height=height_pct,
    )
323
+
324
+
325
+ def get_next_action (model , messages , objective , accurate_mode ):
295
326
if model == "gpt-4-vision-preview" :
296
- content = get_next_action_from_openai (messages , objective )
327
+ content = get_next_action_from_openai (messages , objective , accurate_mode )
297
328
return content
298
329
elif model == "agent-1" :
299
330
return "coming soon"
@@ -314,8 +345,56 @@ def get_last_assistant_message(messages):
314
345
return messages [index ]
315
346
return None # Return None if no assistant message is found
316
347
348
def accurate_mode_double_check(pseudo_messages, prev_x, prev_y):
    """
    Reprompt OAI with additional screenshot of a mini screenshot centered around the cursor for further finetuning of clicked location.

    Captures a mini screenshot centered on the model's previous click guess,
    appends the gridded copy of it (plus ACCURATE_MODE_VISION_PROMPT) to the
    conversation, and asks gpt-4-vision-preview for a refined CLICK response.

    Args:
        pseudo_messages: message list for the vision chat; the follow-up
            message is appended to it in place.
        prev_x: previous "x" value from the CLICK response — a percentage
            string like "50%" (capture_mini_screenshot_with_cursor strips the
            trailing "%").
        prev_y: previous "y" value, same format as prev_x.

    Returns:
        The model's response content string, or "ERROR" if anything fails.
    """
    try:
        screenshot_filename = os.path.join(
            "screenshots", "screenshot_mini.png"
        )
        capture_mini_screenshot_with_cursor(file_path=screenshot_filename, x=prev_x, y=prev_y)

        # capture_mini_screenshot_with_cursor also writes a gridded copy;
        # that gridded image is the one sent to the model.
        new_screenshot_filename = os.path.join(
            "screenshots", "screenshot_mini_with_grid.png"
        )

        with open(new_screenshot_filename, "rb") as img_file:
            img_base64 = base64.b64encode(img_file.read()).decode("utf-8")

        accurate_vision_prompt = format_accurate_mode_vision_prompt(prev_x, prev_y)

        accurate_mode_message = {
            "role": "user",
            "content": [
                {"type": "text", "text": accurate_vision_prompt},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
                },
            ],
        }

        pseudo_messages.append(accurate_mode_message)

        response = client.chat.completions.create(
            model="gpt-4-vision-preview",
            messages=pseudo_messages,
            presence_penalty=1,
            frequency_penalty=1,
            temperature=0.7,
            max_tokens=300,
        )

        content = response.choices[0].message.content

        return content
    except Exception as e:
        # Best-effort: callers check for the "ERROR" sentinel instead of
        # catching exceptions.
        print(f"Error reprompting model for accurate_mode: {e}")
        return "ERROR"
395
+
396
+
397
+ def get_next_action_from_openai (messages , objective , accurate_mode ):
319
398
"""
320
399
Get the next action for Self-Operating Computer
321
400
"""
@@ -355,6 +434,7 @@ def get_next_action_from_openai(messages, objective):
355
434
},
356
435
],
357
436
}
437
+
358
438
# create a copy of messages and save to pseudo_messages
359
439
pseudo_messages = messages .copy ()
360
440
pseudo_messages .append (vision_message )
@@ -374,7 +454,23 @@ def get_next_action_from_openai(messages, objective):
374
454
"content" : "`screenshot.png`" ,
375
455
}
376
456
)
457
+
377
458
content = response .choices [0 ].message .content
459
+
460
+ if accurate_mode :
461
+ if content .startswith ("CLICK" ):
462
+ # Adjust pseudo_messages to include the accurate_mode_message
463
+
464
+ click_data = re .search (r"CLICK \{ (.+) \}" , content ).group (1 )
465
+ click_data_json = json .loads (f"{{{ click_data } }}" )
466
+ prev_x = click_data_json ["x" ]
467
+ prev_y = click_data_json ["y" ]
468
+
469
+ if DEBUG :
470
+ print (f"Previous coords before accurate tuning: prev_x { prev_x } prev_y { prev_y } " )
471
+ content = accurate_mode_double_check (pseudo_messages , prev_x , prev_y )
472
+ assert content != "ERROR" , "ERROR: accurate_mode_double_check failed"
473
+
378
474
return content
379
475
380
476
except Exception as e :
@@ -445,7 +541,6 @@ def summarize(messages, objective):
445
541
print (f"Error parsing JSON: { e } " )
446
542
return "Failed to summarize the workflow"
447
543
448
-
449
544
def mouse_click (click_detail ):
450
545
try :
451
546
x = convert_percent_to_decimal (click_detail ["x" ])
@@ -575,7 +670,51 @@ def search(text):
575
670
return "Open program: " + text
576
671
577
672
673
def capture_mini_screenshot_with_cursor(file_path=os.path.join("screenshots", "screenshot_mini.png"), x=0, y=0):
    """
    Capture an ACCURATE_PIXEL_COUNT x ACCURATE_PIXEL_COUNT screenshot centered
    on (x, y) and write a gridded copy of it for accurate mode.

    Args:
        file_path: destination path for the raw mini screenshot.
        x: horizontal position as a percentage string, e.g. "50%".
        y: vertical position as a percentage string, e.g. "50%".

    Side effects:
        Writes file_path and screenshots/screenshot_mini_with_grid.png.
        Does nothing on platforms other than Linux and macOS (same as the
        original per-branch behavior).
    """
    user_platform = platform.system()
    if user_platform not in ("Linux", "Darwin"):
        return

    # The conversion and grid steps below were previously duplicated verbatim
    # in both the Linux and Darwin branches; they are hoisted here once.
    # Convert percentage strings ("50%" -> 50.0) into absolute pixels.
    x = (float(x[:-1]) / 100) * monitor_size["width"]
    y = (float(y[:-1]) / 100) * monitor_size["height"]

    # Top-left corner of the crop, so (x, y) ends up at the center.
    half = ACCURATE_PIXEL_COUNT / 2
    x1, y1 = int(x - half), int(y - half)

    if user_platform == "Linux":
        x2, y2 = int(x + half), int(y + half)
        screenshot = ImageGrab.grab(bbox=(x1, y1, x2, y2))
        # Upscale so the image is easier to see and percentage marks are more
        # visible.
        screenshot = screenshot.resize(
            (screenshot.width * 2, screenshot.height * 2), Image.LANCZOS
        )
        screenshot.save(file_path)
    else:  # Darwin: use the screencapture utility to capture the region.
        rect = f"-R{x1},{y1},{ACCURATE_PIXEL_COUNT},{ACCURATE_PIXEL_COUNT}"
        subprocess.run(["screencapture", "-C", rect, file_path])

    grid_screenshot_filename = os.path.join(
        "screenshots", "screenshot_mini_with_grid.png"
    )
    add_grid_to_image(file_path, grid_screenshot_filename, int(ACCURATE_PIXEL_COUNT / 2))
715
+
578
716
def capture_screen_with_cursor (file_path = os .path .join ("screenshots" , "screenshot.png" )):
717
+ file_path = os .path .join ("screenshots" , "screenshot.png" )
579
718
user_platform = platform .system ()
580
719
581
720
if user_platform == "Windows" :
@@ -585,8 +724,10 @@ def capture_screen_with_cursor(file_path=os.path.join("screenshots", "screenshot
585
724
# Use xlib to prevent scrot dependency for Linux
586
725
screen = Xlib .display .Display ().screen ()
587
726
size = screen .width_in_pixels , screen .height_in_pixels
727
+ monitor_size ["width" ] = size [0 ]
728
+ monitor_size ["height" ] = size [1 ]
588
729
screenshot = ImageGrab .grab (bbox = (0 , 0 , size [0 ], size [1 ]))
589
- screenshot .save (file_path )
730
+ screenshot .save (file_path )
590
731
elif user_platform == "Darwin" : # (Mac OS)
591
732
# Use the screencapture utility to capture the screen with the cursor
592
733
subprocess .run (["screencapture" , "-C" , file_path ])
@@ -634,9 +775,16 @@ def main_entry():
634
775
default = "gpt-4-vision-preview" ,
635
776
)
636
777
778
+ parser .add_argument (
779
+ "-accurate" ,
780
+ help = "Activate Reflective Mouse Click Mode" ,
781
+ action = "store_true" ,
782
+ required = False ,
783
+ )
784
+
637
785
try :
638
786
args = parser .parse_args ()
639
- main (args .model )
787
+ main (args .model , accurate_mode = args . accurate )
640
788
except KeyboardInterrupt :
641
789
print (f"\n { ANSI_BRIGHT_MAGENTA } Exiting..." )
642
790
0 commit comments