-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_generation.py
102 lines (68 loc) · 4.09 KB
/
run_generation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import data_generation
import openai
import os
import json
import random
import csv
import time
# Take the OpenAI key from the OPENAI_API_KEY environment variable so the
# secret never has to be written into this file; when the variable is unset
# this falls back to the empty string that was previously hard-coded here.
openai.api_key = os.environ.get('OPENAI_API_KEY', '')
# Roles passed to data_generation.construct_training_data for every problem.
ROLES = ["Mathematician", "Economist"]

# Upper bound on how many JSON files are sampled from each subject folder.
batch_size = 20

# Output CSVs: generated training rows, and the list of problems that were used.
output_file_path = 'output-140-2-roles-evals.csv'
output_problem_path = 'problems-140-2-roles-evals.csv'

# Single hand-picked problem for the commented-out one-off call in main():
#QUERY = "A standard six-sided fair die is rolled four times. The probability that the product of all four numbers rolled is a perfect square is $\\tfrac{m}{n}$, where $m$ and $n$ are relatively prime positive integers. Find $m+n$.\n"

# Subject folders to sample from (the same seven paths that were previously
# listed one per line as commented-out `directory = ...` alternatives).
directories = [
    '/Users/corneliaweinzierl/Downloads/MATH/train/algebra',
    '/Users/corneliaweinzierl/Downloads/MATH/train/counting_and_probability',
    '/Users/corneliaweinzierl/Downloads/MATH/train/geometry',
    '/Users/corneliaweinzierl/Downloads/MATH/train/number_theory',
    '/Users/corneliaweinzierl/Downloads/MATH/train/prealgebra',
    '/Users/corneliaweinzierl/Downloads/MATH/train/precalculus',
    '/Users/corneliaweinzierl/Downloads/MATH/train/intermediate_algebra',
]
def process_random_files(directory, batch_size, output_problem_path, *,
                         pause_every=20, pause_seconds=60):
    """Sample JSON problem files and run training-data generation on each.

    For every folder, up to ``batch_size`` ``*.json`` files are chosen at
    random. Each file's ``"problem"`` value is passed to
    ``data_generation.construct_training_data`` (together with the
    module-level ``ROLES``, ``openai`` and ``output_file_path``) and then
    appended as one row to the CSV at ``output_problem_path``.

    Args:
        directory: A single folder path, or an iterable of folder paths.
            (Previously this argument was ignored in favour of the global
            ``directories`` list.)
        batch_size: Maximum number of files sampled from each folder.
        output_problem_path: CSV opened in append mode; a ``Problem`` header
            row is written only when the file is empty.
        pause_every: Sleep after every this-many generations; a falsy value
            disables pausing. Presumably a rate-limit guard -- confirm.
        pause_seconds: Length of each pause, in seconds.
    """
    # Accept one path as well as a collection of paths.
    if isinstance(directory, (str, os.PathLike)):
        folders = [directory]
    else:
        folders = list(directory)

    generation_count = 0
    with open(output_problem_path, 'a', newline='', encoding='utf-8') as csvfile:
        csvwriter = csv.writer(csvfile)
        # Append mode: emit the header only if nothing has been written yet.
        csvfile.seek(0, os.SEEK_END)
        if csvfile.tell() == 0:
            csvwriter.writerow(['Problem'])

        for folder in folders:
            json_files = [f for f in os.listdir(folder) if f.endswith('.json')]
            sampled_files = random.sample(json_files, min(batch_size, len(json_files)))
            for file_name in sampled_files:
                file_path = os.path.join(folder, file_name)
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                # The JSON handle is already closed here, so it is not held
                # open for the duration of the generation call.
                problem = data.get("problem", "No problem found")
                data_generation.construct_training_data(problem, ROLES, data_generation.ROLE_MAP, openai, output_file_path)
                csvwriter.writerow([problem])
                generation_count += 1
                if pause_every and generation_count % pause_every == 0:
                    print("Pausing for {} seconds after {} generations...".format(pause_seconds, generation_count))
                    time.sleep(pause_seconds)
def append_csv_files(source_csv_1, source_csv_2, destination_csv):
    """Write every row of two CSV files, in order, into ``destination_csv``.

    The destination is opened in write mode, so existing content is replaced.
    Rows are copied verbatim: if both sources start with a header row, the
    second header ends up in the middle of the output.

    Args:
        source_csv_1: Path of the CSV whose rows come first.
        source_csv_2: Path of the CSV whose rows follow.
        destination_csv: Path of the combined CSV to create or overwrite.
    """
    with open(destination_csv, 'w', newline='', encoding='utf-8') as dest_file:
        csv_writer = csv.writer(dest_file)
        for source_path in (source_csv_1, source_csv_2):
            # newline='' leaves line-ending handling to the csv module, so
            # line breaks embedded in quoted fields are copied unchanged
            # instead of being rewritten by universal-newline translation.
            with open(source_path, 'r', newline='', encoding='utf-8') as src_file:
                csv_writer.writerows(csv.reader(src_file))
# File names for the (currently commented-out) append_csv_files call in main():
# two source CSVs and the combined file to produce.
source_csv_1, source_csv_2 = (
    'output-1000-4-roles-extension.csv',
    'output-1000-4-roles.csv',
)
destination_csv = 'output-2000-4-roles.csv'
def main():
    """Run the random-sampling generation pass using the module-level settings."""
    # Alternative entry points, left disabled:
    #   data_generation.construct_training_data(QUERY, ROLES, data_generation.ROLE_MAP, openai, output_file_path)
    #   append_csv_files(source_csv_1, source_csv_2, destination_csv)
    process_random_files(directories, batch_size, output_problem_path)


# Only run the pipeline when executed as a script, not when imported.
if __name__ == "__main__":
    main()