-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathmain.py
82 lines (65 loc) · 2.65 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import json
import mimetypes

from dotenv import load_dotenv
from openai import OpenAI

from utils import draw_circle, encode_image
# Load settings from a .env file (python-dotenv) before the API client is built.
load_dotenv()
# Shared API client, constructed with no explicit arguments.
# NOTE(review): presumably it reads its credentials (e.g. OPENAI_API_KEY) from
# the environment populated above — confirm.
client = OpenAI()
def _extract_json_object(content):
    """Return the JSON object text embedded in a model reply.

    The reply may be wrapped in a Markdown code fence and/or surrounded by
    whitespace or prose. Slicing from the first '{' to the last '}' keeps only
    the object itself. When no braces are found, the stripped text is returned
    unchanged so that ``json.loads`` reports the malformed reply.
    """
    text = content.strip()
    start = text.find("{")
    end = text.rfind("}")
    if start != -1 and end > start:
        return text[start:end + 1]
    return text


def _guess_image_mime(image_path):
    """Return the image MIME type implied by the file extension.

    Falls back to ``image/jpeg`` (the previously hard-coded value) when the
    extension is unknown or does not map to an image type.
    """
    mime, _ = mimetypes.guess_type(str(image_path))
    if mime and mime.startswith("image/"):
        return mime
    return "image/jpeg"


def ask_gpt4_vision(system_instrutions, object_to_detect, image_path):
    """Ask the vision model to locate an object in an image.

    Args:
        system_instrutions: System prompt sent to the model (the parameter
            name keeps its original spelling so keyword callers still work).
        object_to_detect: Description of the object to look for; sent to the
            model as ``"Detect: <object_to_detect>"``.
        image_path: Path of the image; it is base64-encoded with
            ``encode_image`` and sent inline as a data URL.

    Returns:
        The dictionary parsed from the model's JSON reply (expected keys:
        ``x``, ``y`` and ``details``). If the request, the parsing, or the
        key lookup fails, the error is printed and
        ``{"x": 0, "y": 0, "details": ""}`` is returned instead.
    """
    base64_image = encode_image(image_path)
    # Declare the real media type in the data URL instead of always claiming JPEG.
    mime_type = _guess_image_mime(image_path)
    try:
        response = client.chat.completions.create(
            model="gpt-4-vision-preview",
            max_tokens=100,
            messages=[
                {
                    "role": "system",
                    "content": system_instrutions,
                },
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": f"Detect: {object_to_detect}"},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{mime_type};base64,{base64_image}"
                            },
                        },
                    ],
                },
            ],
        )
        content = response.choices[0].message.content
        # Keep only the JSON object; the reply is often wrapped in a code fence.
        coordinates = json.loads(_extract_json_object(content))
        print('-' * 50)
        print("Detect:", object_to_detect)
        print("Details:", coordinates["details"])
        print(f"Coordinates: [{coordinates['x']}, {coordinates['y']}]")
        print('-' * 50)
    except Exception as e:
        # Deliberate best-effort boundary: report the problem and fall back to
        # a neutral result so the caller can still proceed.
        print(e)
        coordinates = {"x": 0, "y": 0, "details": ""}
    return coordinates
# Sample image passed to both the model query and draw_circle.
image_path = "assets/kitten-and-puppy.webp"
# Alternative sample image (swap in by uncommenting):
# image_path = "assets/puppy.jpg"
# System prompt: instructs the model to answer with a JSON object containing
# only 'x', 'y' (center of the detected object, origin at the top-left corner)
# and 'details' (a short description).
system_instructions = """
As an image recognition expert, your task is to analyze images and provide
output in JSON format with the following keys only: 'x', 'y', and 'details'.
- 'x' and 'y' should represent the coordinates of the center of the detected
object within the image, with the reference point [0,0] at the top left corner.
- 'details' should provide a brief description of the object identified in the image.
For cases involving the identification of people or animals, focus on locating and
identifying the face of the person or animal. Ensure that the given 'x' and 'y'
coordinates correspond to the center of the identified face.
Please adhere strictly to this output structure:
{
"x": value,
"y": value,
"details": "Description"
}
Note: Do not include any additional data or keys outside of what has been specified.
"""
# Object the model is asked to locate in the image.
detect = "dogs nose"
# Query the model, then pass the returned coordinates to draw_circle.
# NOTE(review): draw_circle comes from utils and is not visible here —
# presumably it marks the returned point on the image; confirm.
coordinates = ask_gpt4_vision(system_instructions, detect, image_path)
draw_circle(image_path, coordinates)