-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_augmentation.py
124 lines (107 loc) · 5.17 KB
/
data_augmentation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import openai
import ir_datasets
import jsonlines
from tqdm import tqdm
import argparse
import glob
import os
import time
# Point the IR_DATASETS_HOME environment variable at the relative directory
# 'cache' (presumably the root where ir_datasets stores downloaded data --
# confirm against the ir_datasets documentation).
# NOTE(review): this assignment runs after `import ir_datasets` above; verify
# that the library reads the variable lazily rather than at import time,
# otherwise the setting may have no effect.
os.environ["IR_DATASETS_HOME"] = 'cache'
def clean_text(text: str) -> str:
    """Return *text* flattened onto a single, whitespace-normalised line.

    Every run of whitespace (spaces, tabs, line feeds, carriage returns, ...)
    is collapsed to one space and leading/trailing whitespace is dropped.

    Args:
        text: The raw string to normalise.

    Returns:
        The normalised string; an empty or whitespace-only input yields "".
    """
    # ``str.split()`` with no separator splits on any whitespace run and
    # discards empty leading/trailing pieces, so this one pass covers the
    # newline, tab, carriage-return, multi-space and strip handling.
    return ' '.join(text.split())
def _chat_with_retry(messages, initial_delay=5.0, max_delay=60.0):
    """Send *messages* to the chat model, retrying on transient failures.

    Calls ``openai.ChatCompletion.create`` with model ``gpt-3.5-turbo``,
    ``temperature=0.7`` and ``frequency_penalty=0.6``.

    Args:
        messages: The chat messages (list of ``{"role", "content"}`` dicts).
        initial_delay: Seconds to wait before the first retry.
        max_delay: Upper bound for the (doubling) wait between retries.

    Returns:
        The content of the first choice, or ``None`` when the API rejects the
        request as invalid (such a request is skipped, not retried).

    Raises:
        openai.error.AuthenticationError: Re-raised after logging.
        Exception: Any other unexpected error is logged and re-raised.
    """
    delay = initial_delay
    while True:
        try:
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=messages,
                temperature=0.7,
                frequency_penalty=0.6,
            )
            return response['choices'][0]['message']["content"]
        except openai.error.APIError as e:
            print(f"OpenAI API returned an API Error: {e}")
        except openai.error.APIConnectionError as e:
            print(f"Failed to connect to OpenAI API: {e}")
        except openai.error.RateLimitError as e:
            print(f"OpenAI API request exceeded rate limit: {e}")
        except openai.error.Timeout as e:
            print(f"OpenAI API request timed out: {e}")
        except openai.error.InvalidRequestError as e:
            # Retrying an invalid request cannot succeed; skip this record.
            print(f"OpenAI API request was invalid: {e}")
            return None
        except openai.error.AuthenticationError as e:
            print(f"OpenAI API request failed authentication: {e}")
            raise
        except Exception as e:
            print(f"Unknown error: {e}")
            raise
        # Only the transient handlers fall through to here: back off
        # exponentially (capped) before trying again.
        time.sleep(delay)
        delay = min(delay * 2, max_delay)


def _parse_numbered_items(generated_text, num_example):
    """Extract the entries of a numbered list ("1. ...") from a model reply.

    Args:
        generated_text: Raw reply text from the model.
        num_example: Number of list entries that were requested.

    Returns:
        The entry texts without their "N." prefix. The list is empty when the
        reply has more than ``num_example + 1`` lines (after collapsing double
        newlines), in which case the whole reply is discarded.
    """
    lines = generated_text.replace('\n\n', '\n').split('\n')
    if len(lines) > num_example + 1:
        return []

    entries = []
    for line in lines:
        stripped = line.strip()
        number, dot, rest = stripped.partition('.')
        # Accept any positive ASCII integer prefix ("1.", "10.", ...) and
        # slice the *stripped* line so leading whitespace cannot corrupt the
        # extracted text.
        is_index = (
            bool(dot)
            and number.isascii()
            and number.isdigit()
            and not number.startswith('0')
        )
        if not is_index:
            continue
        entry = rest.strip()
        if entry:
            entries.append(entry)
    return entries


def main(start: int, end: int, num_example: int) -> None:
    """Augment generated patient notes with additional model-written variants.

    Loads the ``clinicaltrials/2021/trec-ct-2021`` corpus, then for every
    input file in ``data/generated_train_v2`` selected by ``[start:end]``
    asks the chat model for ``num_example`` further patient descriptions per
    record and writes the parsed results to a same-named file in
    ``data/augmented_generated_data_v2``.

    Args:
        start: Index of the first input file to process (inclusive).
        end: Index one past the last input file to process.
        num_example: Number of new descriptions to request per record.

    Raises:
        KeyError: If a record's ``doc_id`` is not present in the corpus.
    """
    input_dir = 'data/generated_train_v2'
    output_dir = 'data/augmented_generated_data_v2'

    dataset = ir_datasets.load("clinicaltrials/2021/trec-ct-2021")
    corpus = {
        doc.doc_id: {
            "title": doc.title,
            "detailed_description": clean_text(doc.detailed_description),
            "summary": clean_text(doc.summary),
            "eligibility": clean_text(doc.eligibility),
        }
        for doc in dataset.docs_iter()
    }

    # os.listdir returns entries in arbitrary order; sort so that the
    # [start:end] shard always refers to the same files.
    files = sorted(os.listdir(input_dir))
    # Create the output directory up front so a missing directory cannot
    # throw away a whole file's worth of API calls at write time.
    os.makedirs(output_dir, exist_ok=True)

    system_message = {
        "role": "system",
        "content": "You are a clinical trial specialist. You can generate patient descriptions that "
                   "are best suited for particular clinical trials.",
    }

    for file in tqdm(files[start:end]):
        generated_data = []
        with jsonlines.open(f'{input_dir}/{file}') as reader:
            for obj in tqdm(reader):
                doc_id = obj['doc_id']
                doc = corpus[doc_id]
                seed_trial = (
                    f"Title: {doc['title']}\n"
                    f"Summary: {doc['summary']}\n"
                    f"Description: {doc['detailed_description']}\n"
                    f"Eligibility: {doc['eligibility']}"
                )
                seed_note = obj['generated_text']
                # NOTE: the prompt wording (including the "trail" typo) is
                # kept exactly as-is so prompts match earlier runs.
                user_message = {
                    "role": "user",
                    "content": (
                        f"The clinical trail is:\n{seed_trial}\n"
                        f"An example of a patient description that suits this clinical trial is:: {seed_note}.\n\n"
                        f" Can you generate other {num_example} patient descriptions that vary from each other "
                        "but all suits this clinical trial? Put your answer in a list."
                    ),
                }

                reply = _chat_with_retry([system_message, user_message])
                if reply is None:
                    # Invalid request: nothing to add for this record.
                    continue
                generated_data.extend(
                    {"doc_id": doc_id, "generated_text": entry}
                    for entry in _parse_numbered_items(reply, num_example)
                )

        with jsonlines.open(f"{output_dir}/{file}", "w") as writer:
            writer.write_all(generated_data)
if __name__ == "__main__":
    # Script entry point: parse the file-slice bounds and the number of
    # descriptions to request, then run the augmentation.
    # The parser is bound to ``parser`` so the ``argparse`` module itself is
    # not shadowed.
    parser = argparse.ArgumentParser(
        description="Generate additional patient descriptions for a slice of input files."
    )
    parser.add_argument("--start", type=int, required=True,
                        help="index of the first input file to process (inclusive)")
    parser.add_argument("--end", type=int, required=True,
                        help="index one past the last input file to process")
    parser.add_argument("--num_example", type=int, default=3,
                        help="number of new descriptions to request per record")
    args = parser.parse_args()
    main(args.start, args.end, args.num_example)