-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathmethod.py
More file actions
212 lines (176 loc) · 9.3 KB
/
method.py
File metadata and controls
212 lines (176 loc) · 9.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
import numpy as np
import os
import utils
import arc_dataset
import draw
import ast
import openai
from openai import OpenAI
def VLSR_Extract_Rule(client, model="o4-mini", data=None, id=0, save_root="images/ARC-AGI/"):
    """Render each example grid pair to images and ask the model to summarize
    the grid-changing rule from them.

    Args:
        client: OpenAI-compatible client exposing ``chat.completions.create``.
        model: Chat model name.
        data: Task dict with key "example", a list of (input_grid, output_grid)
            pairs. (Other keys such as "input_test"/"gt" are not used here.)
        id: Task identifier; images are written under ``save_root/<id>/``.
        save_root: Root directory for the rendered grid images.

    Returns:
        The rule text extracted from the model reply (contents of \\boxed{}),
        as produced by ``utils.extract_answer``.
    """
    prompt = r"I will now provide you with several input and output images about 2D grids. You need to summarize the grid-changing rule from it. Output the rule you learned within \\boxed{}."
    content = [{"type": "text", "text": prompt}]
    # Create the per-task image directory once, not on every loop iteration.
    image_dir = os.path.join(save_root, str(id))
    os.makedirs(image_dir, exist_ok=True)
    for i, (input_example, output_example) in enumerate(data["example"]):
        save_path_input = os.path.join(image_dir, f"input_{i}.jpg")
        save_path_output = os.path.join(image_dir, f"output_{i}.jpg")
        draw.generate_grid_image(grid=input_example, output_filename=save_path_input)
        draw.generate_grid_image(grid=output_example, output_filename=save_path_output)
        content.append({"type": "text", "text": "Example input " + str(i + 1)})
        content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{utils.encode_image(save_path_input)}"}})
        content.append({"type": "text", "text": "Example output " + str(i + 1)})
        content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{utils.encode_image(save_path_output)}"}})
    conversation_history = [{"role": "user", "content": content}]
    response = client.chat.completions.create(
        model=model,
        messages=conversation_history,
        temperature=0.7,
        n=1,
    )
    rule = response.choices[0].message.content
    return utils.extract_answer(rule)
def VLSR_Apply_Rule(client, model="o4-mini", data=None, id=0, rule=None):
    """Present the example matrices as text and ask the model to apply the
    (optionally supplied) rule to the test input.

    Args:
        client: OpenAI-compatible client exposing ``chat.completions.create``.
        model: Chat model name.
        data: Task dict with keys "example", "input_test", and "gt".
        id: Task identifier (unused here; kept for a uniform signature).
        rule: Optional rule text from VLSR_Extract_Rule to include as a hint.

    Returns:
        The \\boxed{} answer extracted from the reply, or the raw reply text
        when nothing could be extracted.
    """
    prompt = r"I will provide you with several input and output matrices. You need to find the matrix-changing rule from it and apply it to the new input. Put the output matrix within \\boxed{}."
    input_test = data["input_test"]
    gt = data["gt"]  # looked up (as in the original API contract) but not used
    parts = [{"type": "text", "text": prompt}]
    if rule is not None:
        prompt_rule = r"Here is a possible rule for your reference. Note that the rule is described in color and each color represents a value in the matrix: [0:black; 1:blue; 2:red; 3:green; 4:yellow; 5:grey; 6:pink; 7:orange; 8:light blue; 9:brown]. You need to first check the correctness of the rule based on the examples. If the rule is correct, apply it to the new input. Otherwise, summarize a new rule and apply it to the new input."
        parts.append({"type": "text", "text": prompt_rule})
        parts.append({"type": "text", "text": "Rule: " + str(rule)})
    for idx, (example_in, example_out) in enumerate(data["example"], start=1):
        parts.append({"type": "text", "text": f"Example input {idx}\n"})
        parts.append({"type": "text", "text": str(example_in)})
        parts.append({"type": "text", "text": f"Example output {idx}\n"})
        parts.append({"type": "text", "text": str(example_out)})
    parts.append({"type": "text", "text": "New Input: \n"})
    parts.append({"type": "text", "text": str(input_test)})
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": parts}],
        temperature=0.7,
        n=1,
    )
    answer = response.choices[0].message.content
    answer_extracted = utils.extract_answer(answer)
    # Fall back to the raw reply when extraction produced nothing usable.
    if answer_extracted is None or answer_extracted == "":
        return answer
    return answer_extracted
def VLSR(client, model="o4-mini", data=None, id=0, save_root="images/ARC-AGI/"):
    """Full VLSR pipeline: extract a rule from the rendered example images,
    then apply it to the test input.

    Returns:
        Tuple of (answer, rule) — the predicted output matrix text and the
        rule the model summarized.
    """
    rule = VLSR_Extract_Rule(client, model, data, id, save_root)
    return VLSR_Apply_Rule(client, model, data, id, rule), rule
def MSSC_Check(client, model="o4-mini", data=None, id=0, save_root="images/ARC-AGI/", answer=None):
    """Ask the model whether the (test input, predicted output) pair follows
    the same rule demonstrated by the example pairs, all shown as images.

    Args:
        client: OpenAI-compatible client exposing ``chat.completions.create``.
        model: Chat model name.
        data: Task dict with keys "example" and "input_test".
        id: Task identifier; images are written under ``save_root/<id>/``.
        save_root: Root directory for the rendered grid images.
        answer: The predicted output matrix, as a Python-literal string or a
            LaTeX matrix string.

    Returns:
        The extracted judgment (expected "True"/"False"), or None when the
        predicted answer cannot be parsed or rendered as a grid.
    """
    prompt = r"I will now provide you with several input and output example images, which follows a specific changing rule. Then, I will give you another input and output pair, determine whether the new pair also follows the same changing rule. Add your final judgment at the end of your replay: \\boxed{True} or \\boxed{False}."
    content = [{"type": "text", "text": prompt}]
    input_test = data["input_test"]
    # Create the per-task image directory once, not on every loop iteration.
    image_dir = os.path.join(save_root, str(id))
    os.makedirs(image_dir, exist_ok=True)
    for i, (input_example, output_example) in enumerate(data["example"]):
        save_path_input = os.path.join(image_dir, f"input_{i}.jpg")
        draw.generate_grid_image(grid=input_example, output_filename=save_path_input)
        save_path_output = os.path.join(image_dir, f"output_{i}.jpg")
        draw.generate_grid_image(grid=output_example, output_filename=save_path_output)
        content.append({"type": "text", "text": "Example input " + str(i + 1)})
        content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{utils.encode_image(save_path_input)}"}})
        content.append({"type": "text", "text": "Example output " + str(i + 1)})
        # BUG FIX: the original attached the *input* image here again, so the
        # model never saw the example outputs. Attach the output image.
        content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{utils.encode_image(save_path_output)}"}})
    save_path_test = os.path.join(image_dir, "input_test.jpg")
    draw.generate_grid_image(grid=input_test, output_filename=save_path_test)
    # Parse the predicted answer: try a Python literal first, then fall back
    # to the LaTeX-matrix parser.
    try:
        answer_list = ast.literal_eval(answer)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        answer_list = utils.latex_matrix_to_list(answer)
    if answer_list is None:
        return None
    save_path_pred = os.path.join(image_dir, "output_pred.jpg")
    try:
        draw.generate_grid_image(grid=answer_list, output_filename=save_path_pred)
    except Exception:
        # The parsed answer is not a renderable grid (e.g. ragged rows);
        # there is nothing to judge.
        return None
    content.append({"type": "text", "text": "New input: "})
    content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{utils.encode_image(save_path_test)}"}})
    content.append({"type": "text", "text": "New output: "})
    content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{utils.encode_image(save_path_pred)}"}})
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": content}],
        temperature=0.7,
        n=1,
    )
    judge = response.choices[0].message.content
    return utils.extract_answer(judge)
def MSSC_Correction(client, model="o4-mini", data=None, id=0, answer=None, judge=None):
    """Ask the model to re-derive the rule from the text examples and produce
    a corrected output matrix, given that the previous answer is suspect.

    Args:
        client: OpenAI-compatible client exposing ``chat.completions.create``.
        model: Chat model name.
        data: Task dict with keys "example", "input_test", and "gt".
        id: Task identifier (unused here; kept for a uniform signature).
        answer: The previous (likely wrong) output matrix text.
        judge: The checker verdict (currently unused by this function).

    Returns:
        The \\boxed{} answer extracted from the reply, or the raw reply text
        when nothing could be extracted.
    """
    prompt = r"I will now provide you with several input and output examples following a specific matrix changing rule. You need to summarize the matrix-changing rule and apply it to the new input. However, the original result you get is likely wrong. Check the result carefully and output a new output matrix within \\boxed{}."
    input_test = data["input_test"]
    gt = data["gt"]  # looked up (as in the original API contract) but not used
    parts = []
    for idx, (example_in, example_out) in enumerate(data["example"], start=1):
        parts.append({"type": "text", "text": f"Example input {idx}\n"})
        parts.append({"type": "text", "text": str(example_in)})
        parts.append({"type": "text", "text": f"Example output {idx}\n"})
        parts.append({"type": "text", "text": str(example_out)})
    parts.append({"type": "text", "text": "New Input: \n"})
    parts.append({"type": "text", "text": str(input_test)})
    parts.append({"type": "text", "text": "Original Output: \n"})
    parts.append({"type": "text", "text": str(answer)})
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": parts}],
        temperature=0.7,
        n=1,
    )
    reply = response.choices[0].message.content
    answer_extracted = utils.extract_answer(reply)
    # Fall back to the raw reply when extraction produced nothing usable.
    if answer_extracted is None or answer_extracted == "":
        return reply
    return answer_extracted
def MSSC(client, model="o4-mini", data=None, id=0, save_root="images/ARC-AGI/", answer=None, max_round=3):
    """Multi-round self-check / self-correct loop.

    Each round renders the current answer, asks MSSC_Check whether the
    (test input, answer) pair follows the examples' rule, and — unless the
    verdict is "True" or the answer is unparsable — asks MSSC_Correction for
    a revised answer.

    Args:
        max_round: Maximum number of check/correct rounds.

    Returns:
        The final answer (possibly unchanged from the input ``answer``).
    """
    # Use "_" instead of shadowing the builtin `round`.
    for _ in range(max_round):
        judge = MSSC_Check(client, model, data, id, save_root, answer)
        # Unparsable prediction (None) or an accepted prediction ("True"):
        # keep the current answer as-is.
        if judge is None or judge == "True":
            return answer
        answer = MSSC_Correction(client, model, data, id, answer, judge)
    # BUG FIX: the original returned `answer_new`, which is a NameError when
    # max_round <= 0; returning `answer` is identical after any iteration.
    return answer
def Reason(client, model="o4-mini", data=None, id=0, save_root="images/ARC-AGI/", max_round=3):
    """Top-level pipeline: run VLSR to get an initial answer and rule, then
    refine the answer with the MSSC check/correct loop.

    Returns:
        Tuple of (answer, rule).
    """
    initial_answer, rule = VLSR(client, model, data, id, save_root)
    refined_answer = MSSC(client, model, data, id, save_root, initial_answer, max_round)
    return refined_answer, rule