@@ -48,6 +48,9 @@ async def get_next_action(model, messages, objective, session_id):
48
48
if model == "gpt-4-with-ocr" :
49
49
operation = await call_gpt_4o_with_ocr (messages , objective , model )
50
50
return operation , None
51
+ if model == "o1-with-ocr" :
52
+ operation = await call_o1_with_ocr (messages , objective , model )
53
+ return operation , None
51
54
if model == "agent-1" :
52
55
return "coming soon"
53
56
if model == "gemini-pro-vision" :
@@ -231,7 +234,7 @@ async def call_gpt_4o_with_ocr(messages, objective, model):
231
234
messages .append (vision_message )
232
235
233
236
response = client .chat .completions .create (
234
- model = "gpt-4o " ,
237
+ model = "o1 " ,
235
238
messages = messages ,
236
239
temperature = 0.7 ,
237
240
max_tokens = 3000 ,
@@ -307,6 +310,121 @@ async def call_gpt_4o_with_ocr(messages, objective, model):
307
310
return gpt_4_fallback (messages , objective , model )
308
311
309
312
313
async def call_o1_with_ocr(messages, objective, model):
    """Ask the o1 model for the next operation, resolving click targets via OCR.

    Captures a screenshot, appends it (base64-encoded) with a user prompt to
    the chat history, sends the conversation to the OpenAI o1 model, and
    parses the returned JSON list of operations. For each "click" operation,
    EasyOCR locates the requested text on the screenshot and the operation is
    augmented with "x"/"y" coordinates.

    Args:
        messages: running chat history; mutated on success (a vision user
            message and the assistant reply are appended).
        objective: the user's objective string, passed to the system-prompt
            confirmation helper.
        model: model identifier string, used for prompts and error logging.

    Returns:
        The list of processed operation dicts, or the result of
        ``gpt_4_fallback(messages, objective, model)`` if anything raises.
    """
    if config.verbose:
        print("[call_o1_with_ocr]")

    try:
        # Brief pause to avoid hammering the API between consecutive steps.
        time.sleep(1)
        client = config.initialize_openai()

        confirm_system_prompt(messages, objective, model)
        screenshots_dir = "screenshots"
        if not os.path.exists(screenshots_dir):
            os.makedirs(screenshots_dir)

        screenshot_filename = os.path.join(screenshots_dir, "screenshot.png")
        # Capture the screen with the cursor visible.
        capture_screen_with_cursor(screenshot_filename)

        with open(screenshot_filename, "rb") as img_file:
            img_base64 = base64.b64encode(img_file.read()).decode("utf-8")

        # First turn gets the longer bootstrap prompt; later turns the short one.
        if len(messages) == 1:
            user_prompt = get_user_first_message_prompt()
        else:
            user_prompt = get_user_prompt()

        vision_message = {
            "role": "user",
            "content": [
                {"type": "text", "text": user_prompt},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
                },
            ],
        }
        messages.append(vision_message)

        response = client.chat.completions.create(
            # BUG FIX: this is the "o1-with-ocr" path but previously requested
            # "gpt-4o" (the model strings were swapped with call_gpt_4o_with_ocr
            # in the same change). Request the o1 model here.
            model="o1",
            messages=messages,
            # o1 reasoning models reject `temperature` and `max_tokens`; they
            # take `max_completion_tokens` instead (per OpenAI reasoning-model
            # API docs). With the old parameters every call would 400 and this
            # function would always fall through to gpt_4_fallback.
            max_completion_tokens=3000,
        )

        content = response.choices[0].message.content

        # Strip markdown fences / stray text so json.loads succeeds.
        content = clean_json(content)

        # Keep the raw string for the assistant message appended below.
        content_str = content

        content = json.loads(content)

        processed_content = []

        for operation in content:
            if operation.get("operation") == "click":
                text_to_click = operation.get("text")
                if config.verbose:
                    print(
                        "[call_o1_with_ocr][click] text_to_click",
                        text_to_click,
                    )
                # Initialize EasyOCR Reader
                reader = easyocr.Reader(["en"])

                # Read the screenshot
                result = reader.readtext(screenshot_filename)

                text_element_index = get_text_element(
                    result, text_to_click, screenshot_filename
                )
                coordinates = get_text_coordinates(
                    result, text_element_index, screenshot_filename
                )

                # add `coordinates`` to `content`
                operation["x"] = coordinates["x"]
                operation["y"] = coordinates["y"]

                if config.verbose:
                    print(
                        "[call_o1_with_ocr][click] text_element_index",
                        text_element_index,
                    )
                    print(
                        "[call_o1_with_ocr][click] coordinates",
                        coordinates,
                    )
                    print(
                        "[call_o1_with_ocr][click] final operation",
                        operation,
                    )
                processed_content.append(operation)

            else:
                processed_content.append(operation)

        # wait to append the assistant message so that if the `processed_content`
        # step fails we don't append a message and mess up message history
        assistant_message = {"role": "assistant", "content": content_str}
        messages.append(assistant_message)

        return processed_content

    except Exception as e:
        print(
            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[{model}] That did not work. Trying another method {ANSI_RESET}"
        )
        if config.verbose:
            print("[Self-Operating Computer][Operate] error", e)
            traceback.print_exc()
        return gpt_4_fallback(messages, objective, model)
310
428
async def call_gpt_4o_labeled (messages , objective , model ):
311
429
time .sleep (1 )
312
430
0 commit comments