Skip to content

Commit ac71d7a

Browse files
committed
add o1-with-ocr
1 parent f370577 commit ac71d7a

File tree

3 files changed

+124
-10
lines changed

3 files changed

+124
-10
lines changed

operate/config.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,8 @@ def validation(self, model, voice_mode):
9393
model == "gpt-4"
9494
or voice_mode
9595
or model == "gpt-4-with-som"
96-
or model == "gpt-4-with-ocr",
96+
or model == "gpt-4-with-ocr"
97+
or model == "o1-with-ocr",
9798
)
9899
self.require_api_key(
99100
"GOOGLE_API_KEY", "Google API key", model == "gemini-pro-vision"

operate/models/apis.py

Lines changed: 119 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,9 @@ async def get_next_action(model, messages, objective, session_id):
4848
if model == "gpt-4-with-ocr":
4949
operation = await call_gpt_4o_with_ocr(messages, objective, model)
5050
return operation, None
51+
if model == "o1-with-ocr":
52+
operation = await call_o1_with_ocr(messages, objective, model)
53+
return operation, None
5154
if model == "agent-1":
5255
return "coming soon"
5356
if model == "gemini-pro-vision":
@@ -231,7 +234,7 @@ async def call_gpt_4o_with_ocr(messages, objective, model):
231234
messages.append(vision_message)
232235

233236
response = client.chat.completions.create(
234-
model="gpt-4o",
237+
model="o1",
235238
messages=messages,
236239
temperature=0.7,
237240
max_tokens=3000,
@@ -307,6 +310,121 @@ async def call_gpt_4o_with_ocr(messages, objective, model):
307310
return gpt_4_fallback(messages, objective, model)
308311

309312

313+
async def call_o1_with_ocr(messages, objective, model):
    """Ask OpenAI's o1 model for the next operation(s) toward *objective*.

    Captures a screenshot, sends it with the running conversation to the o1
    model, parses the JSON list of operations it returns, and resolves any
    ``click`` operation's target text to screen coordinates with EasyOCR.

    Args:
        messages: Mutable conversation history; a vision user message and,
            on success, an assistant message are appended in place.
        objective: The user's overall objective (used for the system prompt).
        model: Model identifier string, forwarded to prompt helpers and the
            fallback path.

    Returns:
        A list of operation dicts (``click`` entries augmented with ``x``/``y``
        coordinates), or whatever ``gpt_4_fallback`` returns if anything fails.
    """
    if config.verbose:
        print("[call_o1_with_ocr]")

    try:
        time.sleep(1)
        client = config.initialize_openai()

        confirm_system_prompt(messages, objective, model)
        screenshots_dir = "screenshots"
        if not os.path.exists(screenshots_dir):
            os.makedirs(screenshots_dir)

        screenshot_filename = os.path.join(screenshots_dir, "screenshot.png")
        # Capture the screen with the cursor included so the model sees it.
        capture_screen_with_cursor(screenshot_filename)

        with open(screenshot_filename, "rb") as img_file:
            img_base64 = base64.b64encode(img_file.read()).decode("utf-8")

        if len(messages) == 1:
            user_prompt = get_user_first_message_prompt()
        else:
            user_prompt = get_user_prompt()

        vision_message = {
            "role": "user",
            "content": [
                {"type": "text", "text": user_prompt},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
                },
            ],
        }
        messages.append(vision_message)

        # FIX: previously requested "gpt-4o" even though this function exists
        # specifically to drive o1 (and the commit swapped "o1" into
        # call_gpt_4o_with_ocr instead). Also, o1 rejects `temperature` and
        # `max_tokens`: reasoning models only accept the default temperature
        # and use `max_completion_tokens` for the output budget.
        response = client.chat.completions.create(
            model="o1",
            messages=messages,
            max_completion_tokens=3000,
        )

        content = response.choices[0].message.content

        content = clean_json(content)

        # Keep the raw JSON string for the assistant message appended below.
        content_str = content

        content = json.loads(content)

        processed_content = []

        for operation in content:
            if operation.get("operation") == "click":
                text_to_click = operation.get("text")
                if config.verbose:
                    print(
                        "[call_o1_with_ocr][click] text_to_click",
                        text_to_click,
                    )
                # Initialize EasyOCR Reader
                reader = easyocr.Reader(["en"])

                # Read the screenshot
                result = reader.readtext(screenshot_filename)

                text_element_index = get_text_element(
                    result, text_to_click, screenshot_filename
                )
                coordinates = get_text_coordinates(
                    result, text_element_index, screenshot_filename
                )

                # Add the resolved `coordinates` to the operation dict.
                operation["x"] = coordinates["x"]
                operation["y"] = coordinates["y"]

                if config.verbose:
                    print(
                        "[call_o1_with_ocr][click] text_element_index",
                        text_element_index,
                    )
                    print(
                        "[call_o1_with_ocr][click] coordinates",
                        coordinates,
                    )
                    print(
                        "[call_o1_with_ocr][click] final operation",
                        operation,
                    )
                processed_content.append(operation)

            else:
                processed_content.append(operation)

        # Wait to append the assistant message so that if the
        # `processed_content` step fails we don't append a message and mess up
        # the message history.
        assistant_message = {"role": "assistant", "content": content_str}
        messages.append(assistant_message)

        return processed_content

    except Exception as e:
        print(
            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[{model}] That did not work. Trying another method {ANSI_RESET}"
        )
        if config.verbose:
            print("[Self-Operating Computer][Operate] error", e)
            traceback.print_exc()
        return gpt_4_fallback(messages, objective, model)
426+
427+
310428
async def call_gpt_4o_labeled(messages, objective, model):
311429
time.sleep(1)
312430

operate/models/prompts.py

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -232,20 +232,15 @@ def get_system_prompt(model, objective):
232232
os_search_str=os_search_str,
233233
operating_system=operating_system,
234234
)
235-
elif model == "gpt-4-with-ocr":
236-
prompt = SYSTEM_PROMPT_OCR.format(
237-
objective=objective,
238-
cmd_string=cmd_string,
239-
os_search_str=os_search_str,
240-
operating_system=operating_system,
241-
)
242-
elif model == "claude-3":
235+
elif model == "gpt-4-with-ocr" or model == "o1-with-ocr" or model == "claude-3":
236+
print("adding SYSTEM_PROMPT_OCR")
243237
prompt = SYSTEM_PROMPT_OCR.format(
244238
objective=objective,
245239
cmd_string=cmd_string,
246240
os_search_str=os_search_str,
247241
operating_system=operating_system,
248242
)
243+
249244
else:
250245
prompt = SYSTEM_PROMPT_STANDARD.format(
251246
objective=objective,

0 commit comments

Comments
 (0)