-
Notifications
You must be signed in to change notification settings - Fork 131
/
Copy pathevaluate.py
executable file
·109 lines (91 loc) · 3.88 KB
/
evaluate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import argparse
import numpy as np
from tqdm import tqdm
from pebble import ProcessPool
from concurrent.futures import TimeoutError
from grader import *
from parser import *
from utils import load_jsonl
from python_executor import PythonExecutor
def evaluate(data_name, prompt_type, samples: list=None, file_path: str=None, max_num_samples=None, execute=False):
    """Score every prediction against its ground truth and summarise accuracy.

    Samples come from ``samples`` or, when that is empty/None, from the JSONL
    file at ``file_path``. Each entry of a sample's ``'pred'`` sequence is
    compared with the parsed ground truth by ``math_equal_process`` inside a
    single-worker ``ProcessPool``. A comparison that times out is scored as
    ``False`` and counted in ``timeout_samples``.

    Args:
        data_name: Dataset name; forwarded to ``parse_ground_truth``.
        prompt_type: Accepted for interface compatibility; not read here.
        samples: List of sample dicts. Each needs a ``'pred'`` list (assumed
            non-empty, since its last element is indexed). If the first
            sample has an ``'idx'`` key, samples are de-duplicated by
            ``'idx'`` (last one wins), sorted by it, and the caller's dicts
            are updated in place; otherwise new dicts are built with ``idx``
            set to the list position.
        file_path: JSONL file to load when ``samples`` is not given.
        max_num_samples: If truthy, only the first ``max_num_samples``
            samples (after the ordering above) are evaluated.
        execute: Accepted for interface compatibility; not read here.

    Returns:
        ``(samples, result_json)``. Every sample gains ``'gt_cot'``, ``'gt'``
        and ``'score'`` (one entry per prediction). ``result_json`` holds
        ``num_samples``, ``num_scores``, ``timeout_samples``,
        ``empty_samples``, ``acc`` (percentage for the first prediction
        column) and, when the first sample has a ``'type'`` key,
        ``type_acc`` (percentage per type, based on each sample's last
        score).

    Raises:
        AssertionError: If neither ``samples`` nor ``file_path`` is provided.
        SystemExit: If a comparison fails with anything other than a timeout.
    """
    assert samples or file_path, "samples or file_path must be provided"
    if not samples:
        samples = list(load_jsonl(file_path))
    if 'idx' in samples[0]:
        # Later duplicates of an idx overwrite earlier ones; then restore idx order.
        samples = {sample['idx']: sample for sample in samples}.values()
        samples = sorted(samples, key=lambda x: x['idx'])
    else:
        samples = [dict(idx=idx, **sample) for idx, sample in enumerate(samples)]

    if max_num_samples:
        print(f"max_num_samples: {max_num_samples} / {len(samples)}")
        samples = samples[:max_num_samples]

    # parse gt
    for sample in samples:
        sample['gt_cot'], sample['gt'] = parse_ground_truth(sample, data_name)
    # One comparison task per prediction, flattened in sample order.
    params = [(idx, pred, sample['gt']) for idx, sample in enumerate(samples) for pred in sample['pred']]

    scores = []
    timeout_cnt = 0

    with ProcessPool(max_workers=1) as pool:
        future = pool.map(math_equal_process, params, timeout=3)
        iterator = future.result()
        # The bar advances once per comparison, so its total must be the
        # number of tasks, not the number of samples.
        with tqdm(total=len(params), desc="Evaluate") as progress_bar:
            while True:
                try:
                    result = next(iterator)
                    scores.append(result)
                except StopIteration:
                    break
                except TimeoutError as error:
                    # A timed-out comparison counts as a wrong answer.
                    print(error)
                    scores.append(False)
                    timeout_cnt += 1
                except Exception as error:
                    # Not every exception carries a `.traceback` attribute;
                    # fall back to the exception itself so the real failure
                    # is never masked by an AttributeError.
                    print(getattr(error, "traceback", error))
                    # Abort with a failing status (the bare `exit()` helper
                    # reported success).
                    raise SystemExit(1) from error
                progress_bar.update(1)

    # Slice the flat score list back into one row per sample.
    idx = 0
    score_mat = []
    for sample in samples:
        sample['score'] = scores[idx: idx+len(sample['pred'])]
        assert len(sample['score']) == len(sample['pred'])
        score_mat.append(sample['score'])
        idx += len(sample['pred'])

    max_len = max(len(s) for s in score_mat)

    # Make the matrix rectangular by repeating each short row's last score.
    # Only score_mat is padded; sample['score'] keeps its original length.
    for i, s in enumerate(score_mat):
        if len(s) < max_len:
            score_mat[i] = s + [s[-1]] * (max_len - len(s))  # pad

    # output mean of each column of scores
    col_means = np.array(score_mat).mean(axis=0)
    mean_score = list(np.round(col_means * 100, decimals=1))

    result_json = {
        "num_samples": len(samples),
        "num_scores": len(scores),
        "timeout_samples": timeout_cnt,
        # Samples whose last prediction is falsy (e.g. an empty string).
        "empty_samples": sum(1 for s in samples if not s['pred'][-1]),
        "acc": mean_score[0]
    }

    # each type score
    if "type" in samples[0]:
        type_scores = {}
        for sample in samples:
            type_scores.setdefault(sample['type'], []).append(sample['score'][-1])
        type_scores = {k: np.round(np.array(v).mean() * 100, decimals=1) for k, v in type_scores.items()}
        # Report the types in sorted key order.
        type_scores = {k: type_scores[k] for k in sorted(type_scores)}
        result_json['type_acc'] = type_scores

    print(result_json)
    return samples, result_json
def parse_args():
    """Parse the evaluation script's command-line options.

    Returns:
        The ``argparse.Namespace`` with ``data_name``, ``prompt_type``,
        ``file_path`` (required), ``max_num_samples`` and ``execute``.
    """
    cli = argparse.ArgumentParser()
    # (flag, add_argument keyword options), registered in this order.
    option_table = (
        ("--data_name", dict(type=str, default="math")),
        ("--prompt_type", dict(type=str, default="tool-integrated")),
        ("--file_path", dict(type=str, default=None, required=True)),
        ("--max_num_samples", dict(type=int, default=None)),
        ("--execute", dict(action="store_true")),
    )
    for flag, options in option_table:
        cli.add_argument(flag, **options)
    return cli.parse_args()
if __name__ == "__main__":
    # Script entry point: read the CLI options and evaluate the given file.
    args = parse_args()
    evaluate(
        data_name=args.data_name,
        prompt_type=args.prompt_type,
        file_path=args.file_path,
        max_num_samples=args.max_num_samples,
        execute=args.execute,
    )