-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathauto_extract_gpt.py
106 lines (72 loc) · 2.36 KB
/
auto_extract_gpt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import google.generativeai as genai
from google.generativeai.types import safety_types
import os
from dotenv import load_dotenv
from dotenv import find_dotenv
load_dotenv(find_dotenv())
from pathlib import Path
import pandas as pd
import time
from openai import OpenAI
# Workspace root: the directory that contains this script.
WS = Path(__file__).parent
# Directory scanned for per-paper subdirectories (see load_markdown).
PAPERS = WS / 'database' / 'papers'
def query_gpt4o(system_prompt, prompt_text):
    """Send one system/user message pair to the ``gpt-4o`` chat model.

    The API key is read from the ``GPT-4O-KEY`` environment variable and a
    new client is created on every call.

    Args:
        system_prompt: Content of the ``system`` message.
        prompt_text: Content of the ``user`` message.

    Returns:
        The message content of the first choice in the completion.
    """
    api_key = os.getenv('GPT-4O-KEY')
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt_text},
    ]
    completion = OpenAI(api_key=api_key).chat.completions.create(
        model="gpt-4o",
        messages=conversation,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
def load_markdown():
    """Collect the checked markdown files below ``PAPERS``.

    Every subdirectory of ``PAPERS`` is scanned one level deep. An entry is
    kept when its name contains ``'checked'`` and its suffix is either
    ``.markdown`` or ``.md``. Non-directory entries directly inside
    ``PAPERS`` are ignored.

    Returns:
        A list of paths, in the order ``iterdir()`` yields them.
    """
    markdown_suffixes = ('.markdown', '.md')
    found = []
    for paper_dir in PAPERS.iterdir():
        if paper_dir.is_dir():
            found.extend(
                entry
                for entry in paper_dir.iterdir()
                if 'checked' in entry.name
                and entry.suffix in markdown_suffixes
            )
    return found
def extract_one_paper(
        content,
        system_prompt,
        user_prompt,
        questions,
        save_path):
    """Ask GPT-4o every question about one paper and save the answers.

    Args:
        content: Text of the paper; substituted for the ``paper_content``
            placeholder of ``user_prompt``.
        system_prompt: System message sent with every request.
        user_prompt: ``str.format`` template that receives the keyword
            arguments ``paper_content`` and ``question``.
        questions: DataFrame with a ``question`` column. It is modified in
            place: its ``Answer`` column is set to the model replies.
        save_path: Destination passed to ``DataFrame.to_excel``.
    """
    answers = []
    # Walk the column values directly instead of ``range(len(...))`` labels,
    # so rows are matched by position even when the index is not 0..n-1.
    for question in questions["question"]:
        formatted_user_prompt = user_prompt.format(
            paper_content=content,
            question=question)
        answers.append(query_gpt4o(system_prompt, formatted_user_prompt))
        # Fixed pause after each request (presumably to stay under the API
        # rate limit -- confirm before changing).
        time.sleep(2)
    # Assign the whole column in one step: a list is matched positionally,
    # and replacing the column avoids writing strings cell-by-cell into an
    # existing empty (float/NaN) ``Answer`` column.
    questions['Answer'] = answers
    questions.to_excel(save_path, index=False)
def extract_papers(files, system_prompt, user_prompt, questions):
    """Run the question extraction for the first paper that has no output.

    For each path in ``files`` the result file is expected next to the paper
    as ``<parent directory name>_related_sentence.xlsx``. Papers whose result
    file already exists are skipped.

    Args:
        files: Iterable of ``pathlib.Path`` objects pointing at paper files.
        system_prompt: System message forwarded to ``extract_one_paper``.
        user_prompt: Prompt template forwarded to ``extract_one_paper``.
        questions: Question DataFrame forwarded to ``extract_one_paper``.
    """
    for f in files:
        print(f.name)
        save_path = f.parent / f'{f.parent.name}_related_sentence.xlsx'
        # Decide whether to skip before reading the paper, so finished papers
        # cost no file I/O and cannot fail on an unreadable source file.
        if save_path.exists():
            continue
        content = f.read_text(encoding='utf-8')
        extract_one_paper(
            content, system_prompt,
            user_prompt, questions, save_path)
        # NOTE(review): the loop stops after the first paper it processes, so
        # each run handles at most one new paper. Kept as in the original;
        # confirm whether this is intentional or a debugging leftover.
        break
def work():
    """Load the prompts and the question sheet, then start the extraction.

    Reads ``prompt/system.txt`` and ``prompt/auto_extract.txt`` below ``WS``
    as the system prompt and the user prompt template, loads
    ``database/Questions.xlsx`` into a DataFrame, collects the paper files
    with ``load_markdown`` and hands everything to ``extract_papers``.
    """
    prompt_dir = WS / 'prompt'
    # Read the prompts as UTF-8, matching how the paper files are read,
    # instead of relying on the platform's locale-dependent default encoding.
    system_prompt = (prompt_dir / 'system.txt').read_text(encoding='utf-8')
    user_prompt = (prompt_dir / 'auto_extract.txt').read_text(encoding='utf-8')
    questions = pd.read_excel(str(WS / 'database' / 'Questions.xlsx'))
    files = load_markdown()
    extract_papers(files, system_prompt, user_prompt, questions)
# Run the extraction only when this file is executed as a script.
if __name__ == '__main__':
    work()