# app.py
import math
import os
from collections import Counter
from io import BytesIO
from pathlib import Path

import openai
import pandas as pd
import pdfplumber
import requests
from flask import Flask, render_template, request
from flask_cors import CORS
from llama_index import download_loader, GPTSimpleVectorIndex
from openai.embeddings_utils import cosine_similarity, get_embedding
openai.api_key = os.getenv("OPENAI_API_KEY")
if openai.api_type == "azure":
    openai.api_version = os.environ.get("OPENAI_API_VERSION", "2023-03-15-preview")

openai_chat_model = os.environ.get("OPENAI_CHAT_MODEL", "gpt-3.5-turbo")
openai_embedding_model = os.environ.get("OPENAI_EMBEDDING_MODEL", "text-embedding-ada-002")
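# A minimal environment sketch for running this app (values are placeholders, not real keys):
#   export OPENAI_API_KEY=sk-...
#   export OPENAI_CHAT_MODEL=gpt-3.5-turbo               # optional, default shown above
#   export OPENAI_EMBEDDING_MODEL=text-embedding-ada-002 # optional, default shown above
#   export PORT=5000                                     # read by app.run() at the bottom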
app = Flask(__name__, template_folder="./frontend/dist", static_folder="./frontend/dist/assets")
CORS(app)


class Chatbot:
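    """Parse an uploaded PDF into text chunks, embed them, and build prompts for a chat model."""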
    def parse_paper(self, pdf):
        """Split a pdfplumber PDF into main-text and misc (references, etc.) chunks.

        Returns (paper_text, misc_text, title_sentence): two lists of
        {'text', 'page'} chunks and a best-guess title from the first page.
        """
        print("Parsing paper")
        global number_of_pages  # shared with create_messages via module globals
        number_of_pages = len(pdf.pages)
        print(f"Total number of pages: {number_of_pages}")
        paper_text = []
        misc_text = []
        ismisc = False
        for i in range(number_of_pages):
            page = pdf.pages[i]
            isfirstpage = (i == 0)
            page_text = []
            misc_page_text = []
            sentences = []
            misc_sentences = []
            processed_text = []
            misc_processed_text = []

            def visitor_body(text, isfirstpage, x, top, bottom, fontSize, ismisc):
                if isfirstpage:  # first page: process the text directly
                    sentences.append({
                        'fontsize': fontSize,
                        'text': ' ' + text.strip().replace('\x03', ''),
                        'x': x,
                        'y': top
                    })
                else:  # later pages: rule out headers and footers
                    if (top > 70 and bottom < 720) and (len(text.strip()) > 1) and not ismisc:  # main text region
                        sentences.append({
                            'fontsize': fontSize,
                            'text': ' ' + text.strip().replace('\x03', ''),
                            'x': x,
                            'y': top
                        })
                    elif (top > 70 and bottom < 720) and (len(text.strip()) > 1) and ismisc:
                        misc_sentences.append({
                            'fontsize': fontSize,
                            'text': ' ' + text.strip().replace('\x03', ''),
                            'x': x,
                            'y': top
                        })

            extracted_words = page.extract_words(
                x_tolerance=1, y_tolerance=3, keep_blank_chars=False, use_text_flow=True,
                horizontal_ltr=True, vertical_ttb=True, extra_attrs=["fontname", "size"],
                split_at_punctuation=False)
            # Treat the first page, main text, and references differently, specifically to handle headers.
            # Keywords that mark the start of the misc (references/acknowledgements) section.
            keywords_for_misc = ['References', 'REFERENCES', 'Bibliography', 'BIBLIOGRAPHY',
                                 'Acknowledgements', 'ACKNOWLEDGEMENTS', 'Acknowledgments',
                                 'ACKNOWLEDGMENTS', '参考文献', '致谢']
            # Section headings are sometimes extracted with a detached first letter
            # (e.g. 'R' + 'eferences'), so such single letters are merged with the
            # following word before matching against the misc keywords.
            keywords_for_exception = ['R', 'B', 'A']
            extracted_words = iter(extracted_words)
            tmp_text = ''
            # Loop through the extracted words
            for extracted_word in extracted_words:
                # Strip the text and remove any special characters
                text = extracted_word['text'].strip().replace('\x03', '')
                if len(text) == 1 and any(keyword in text for keyword in keywords_for_exception):
                    try:
                        tmp_text = text + next(extracted_words)['text']
                    except StopIteration:
                        pass
                # Once a misc keyword is seen, everything from there on is treated as misc
                if any(keyword in tmp_text for keyword in keywords_for_misc):
                    ismisc = True
                # Route the word into the main or misc sentence list
                visitor_body(text, isfirstpage, extracted_word['x0'], extracted_word['top'],
                             extracted_word['bottom'], extracted_word['size'], ismisc=ismisc)
            if sentences != []:
                if len(sentences) == 1:
                    page_text.append(sentences[0])
                else:
                    for j in range(len(sentences) - 1):
                        # Merge fragments shorter than an average sentence into the next fragment
                        if len(sentences[j]['text']) < 100 and sentences[j]['fontsize'] == sentences[j + 1]['fontsize']:
                            sentences[j + 1]['text'] = sentences[j]['text'] + sentences[j + 1]['text']
                        else:
                            page_text.append(sentences[j])
                    page_text.append(sentences[-1])  # keep the final (possibly merged) fragment
            if ismisc:
                if len(misc_sentences) == 1:
                    misc_page_text.append(misc_sentences[0])
                elif misc_sentences != []:
                    for j in range(len(misc_sentences) - 1):
                        if len(misc_sentences[j]['text']) < 100 and misc_sentences[j]['fontsize'] == misc_sentences[j + 1]['fontsize']:
                            misc_sentences[j + 1]['text'] = misc_sentences[j]['text'] + misc_sentences[j + 1]['text']
                        else:
                            misc_page_text.append(misc_sentences[j])
                    misc_page_text.append(misc_sentences[-1])
            blob_font_sizes = []
            blob_font_size = None
            blob_text = ''
            processed_text = []
            if ismisc:
                misc_blob_font_sizes = []
                misc_blob_font_size = None
                misc_blob_text = ''
                misc_processed_text = []
            tolerance = 1
            title_tolerance = 3.6
            # Preprocessing for title font size
            if isfirstpage:
                title_clean = []
                title_font_sizes = sorted(q['fontsize'] for q in page_text)
                # The title is usually the word cluster with the largest font size;
                # fall back to the second-largest when the largest dominates the page.
                if len(title_font_sizes) < 2 or \
                        title_font_sizes.count(title_font_sizes[-1]) > title_font_sizes.count(title_font_sizes[-2]):
                    title_font_size = title_font_sizes[-1]
                else:
                    title_font_size = title_font_sizes[-2]
                for r in page_text:
                    if title_font_size - title_tolerance <= r['fontsize'] <= title_font_size:
                        title_clean.append(r['text'])
                title_sentence = ' '.join(title_clean)
            # Preprocessing for main text font size
            if page_text != []:
                blob_font_sizes.extend(t['fontsize'] for t in page_text)
                blob_font_size = Counter(blob_font_sizes).most_common(1)[0][0]
            # Preprocessing for misc text font size
            if ismisc and misc_page_text != []:
                misc_blob_font_sizes.extend(s['fontsize'] for s in misc_page_text)
                misc_blob_font_size = Counter(misc_blob_font_sizes).most_common(1)[0][0]
            if page_text != []:
                if len(page_text) == 1:
                    if blob_font_size - tolerance <= page_text[0]['fontsize'] <= blob_font_size + tolerance:
                        processed_text.append({
                            'text': page_text[0]['text'],
                            'page': i + 1
                        })
                else:
                    for t in range(len(page_text)):
                        if blob_font_size - tolerance <= page_text[t]['fontsize'] <= blob_font_size + tolerance:
                            blob_text += page_text[t]['text']
                            if len(blob_text) >= 500:  # target length of a data chunk
                                processed_text.append({
                                    'text': blob_text,
                                    'page': i + 1
                                })
                                blob_text = ''
                            elif t == len(page_text) - 1:  # last element: flush the remainder
                                processed_text.append({
                                    'text': blob_text,
                                    'page': i + 1
                                })
            paper_text += processed_text
            if ismisc:
                if len(misc_page_text) == 1:
                    if misc_blob_font_size - tolerance <= misc_page_text[0]['fontsize'] <= misc_blob_font_size + tolerance:
                        misc_processed_text.append({
                            'text': misc_page_text[0]['text'],
                            'page': i + 1
                        })
                else:
                    for s in range(len(misc_page_text)):
                        if misc_blob_font_size - tolerance <= misc_page_text[s]['fontsize'] <= misc_blob_font_size + tolerance:
                            misc_blob_text += misc_page_text[s]['text']
                            if len(misc_blob_text) >= 500:  # target length of a data chunk
                                misc_processed_text.append({
                                    'text': misc_blob_text,
                                    'page': i + 1
                                })
                                misc_blob_text = ''
                            elif s == len(misc_page_text) - 1:  # last element: flush the remainder
                                misc_processed_text.append({
                                    'text': misc_blob_text,
                                    'page': i + 1
                                })
                misc_text += misc_processed_text
print("Done parsing paper")
# title judgement
if len(title_clean) != 1:
if len(title_clean) <= 3:
for title in title_clean:
if len(title) > 30:
title_sentence = title
break
else:
for i in range(3): # try 3 times
if len(title_clean[i]) > 30:
title_sentence = title_clean[i]
break
return paper_text, misc_text, title_sentence
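    # Each chunk produced above has the shape {'text': <roughly 500-character blob>, 'page': <1-based page>},
    # e.g. {'text': ' Transformers have become ...', 'page': 3} (the example text is illustrative only).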

    def paper_df(self, pdf):
        """Turn the parsed chunks into a DataFrame, dropping very short fragments."""
        print('Creating dataframe')
        filtered_pdf = []
        for row in pdf:
            if len(row['text']) < 30:
                continue
            filtered_pdf.append(row)
        df = pd.DataFrame(filtered_pdf)
        # Remove rows with identical df['text'] and df['page'] values
        df = df.drop_duplicates(subset=['text', 'page'], keep='first')
        df['length'] = df['text'].apply(lambda x: len(x))
        print('Done creating dataframe')
        return df

    def calculate_embeddings(self, df):
        """Embed every chunk with the configured OpenAI embedding model."""
        print('Calculating embeddings')
        df["embeddings"] = df.text.apply(lambda x: get_embedding(x, engine=openai_embedding_model))
        print('Done calculating embeddings')
        return df

    def search_embeddings(self, df, query, n=3, pprint=True):
        """Return the n chunks most similar to the query, re-ordered by page number."""
        query_embedding = get_embedding(
            query,
            engine=openai_embedding_model
        )
        df["similarity"] = df.embeddings.apply(lambda x: cosine_similarity(x, query_embedding))
        results = df.sort_values("similarity", ascending=False, ignore_index=True)
        # Keep the top n results and re-order them by page number
        results = results.head(n).sort_values(by='page')
        global sources
        sources = []
        for i in range(len(results)):
            # Record the page number and a preview of the text for each result
            sources.append({'Page ' + str(results.iloc[i]['page']): results.iloc[i]['text'][:150] + '...'})
        print(sources)
        return results
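    # 'sources' ends up as a list of single-entry dicts, e.g.
    # [{'Page 2': ' We propose a novel ...'}, {'Page 5': ' Our experiments show ...'}]
    # (the page numbers and previews here are illustrative, not from a real paper).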

    def get_title(self, title_related):
        """Build a chat prompt that extracts the paper title from the candidate strings."""
        system_role = """I have a list that contains the title of a paper. I want to extract the title of the paper."""
        user_content = f"""Given the string of a title and its related information: "{str(title_related)}". Return the title of the paper. Return the title only. Do not return any additional text."""
        messages = [
            {"role": "system", "content": system_role},
            {"role": "user", "content": user_content},
        ]
        print('Done generating title messages')
        return messages

    def get_scope(self, user_input):
        """Build a chat prompt that classifies whether the user asks for detailed information."""
        system_role = """You are going to determine whether a sentence requests detailed information from a research paper. Please note that 'main idea', 'summary', 'abstraction', and 'overview' are not detailed information. Also, for sentences that contain neither 'detail' nor 'reviewer', you should return a False value. Return a boolean value only."""
        user_content = f"""The sentence is: "{str(user_input)}"."""
        messages = [
            {"role": "system", "content": system_role},
            {"role": "user", "content": user_content},
        ]
        return messages

    def create_messages(self, df, user_input, title, isdetail, *df_misc):
        """Assemble the chat messages from the most relevant chunks plus a llama_index hint."""
        if not df_misc:  # *df_misc is a (possibly empty) tuple of extra DataFrames
            result_df = df
        else:
            result_df = pd.concat([df, *df_misc])
        if number_of_pages >= 6 and isdetail:
            result = self.search_embeddings(result_df, user_input, n=int(math.log2(number_of_pages)) + 2)
        elif number_of_pages >= 6:
            result = self.search_embeddings(result_df, user_input, n=int(math.log2(number_of_pages)))
        else:
            result = self.search_embeddings(result_df, user_input, n=min(len(df), 3))
        # Split a fixed character budget evenly across the retrieved chunks
        total_max_string = 2000
        num_chunks = len(result)
        max_string = total_max_string // max(num_chunks, 1)
        embeddings = []
        for i in range(num_chunks):
            embeddings.append(str(result.iloc[i]['text'])[:max_string])

        def get_gpt_augmentation(query):
            # 'index' is the GPTSimpleVectorIndex built when the PDF was uploaded
            response = index.query(query)
            return response

        gpt_augmentation = get_gpt_augmentation(str(user_input)).response.strip()[:100]
        system_role = f"""As an academician, you are tasked with reading and summarizing a scientific paper. You are given text embeddings, the title of the paper, and a prompt sentence. Your goal is to answer any questions related to the paper using the given embeddings and the title. The embeddings are as follows: {str(embeddings)}. The title of the paper is: {title}. The prompt sentence is: {str(gpt_augmentation)}. To ensure that your answers are accurate and relevant, you must answer in the language of the question. If you cannot answer the question, apologize and say directly that you do not have the information."""
        user_content = f"""The question is: "{str(user_input)}"."""
        messages = [
            {"role": "system", "content": system_role},
            {"role": "user", "content": user_content},
        ]
        print('Done creating messages')
        return messages

    def gpt(self, messages, isdetail):
        """Send the messages to the chat model and package the answer (with sources if available)."""
        print('Sending request to', openai_chat_model)
        if openai.api_type == "azure":
            model_params = {"engine": openai_chat_model}  # Azure deployments are addressed by engine
        else:
            model_params = {"model": openai_chat_model}
        r = openai.ChatCompletion.create(
            **model_params,
            messages=messages,
            temperature=0.7,
            max_tokens=1500)
        answer = r.choices[0]["message"]["content"]
        print('Done sending request to', openai_chat_model)
        if isdetail:
            return {'answer': answer.replace("\n", "")}
        if 'sources' in globals():
            return {'answer': answer, 'sources': sources}
        return {'answer': answer.replace("\n", "")}
@app.route("/api/process_pdf", methods=['POST'])
def process_pdf():
print("Processing pdf")
file = request.data
CJKPDFReader = download_loader("CJKPDFReader")
loader = CJKPDFReader()
with open('upload.pdf', 'wb') as f:
f.write(request.data)
global document, index
document = loader.load_data(file=Path('./upload.pdf'))
index = GPTSimpleVectorIndex.from_documents(document)
pdf = pdfplumber.open(BytesIO(file))
chatbot = Chatbot()
paper_text, misc_text, title_related = chatbot.parse_paper(pdf)
global df_main, df_misc
df_main = chatbot.paper_df(paper_text)
df_main = chatbot.calculate_embeddings(df_main)
if misc_text != []:
df_misc = chatbot.paper_df(misc_text)
df_misc = chatbot.calculate_embeddings(df_misc)
title_request = chatbot.get_title(title_related)
global title
pdf = pdfplumber.open(BytesIO(file))
chatbot = Chatbot()
paper_text, misc_text, title_related = chatbot.parse_paper(pdf)
title_request = chatbot.get_title(title_related)
title = chatbot.gpt(title_request, False)
title = title['answer']
print("Done processing pdf")
return {'key': ''}
@app.route("/api/download_pdf", methods=['POST'])
def download_pdf():
chatbot = Chatbot()
CJKPDFReader = download_loader("CJKPDFReader")
loader = CJKPDFReader()
url = request.json['url']
r = requests.get(str(url))
print(r.headers)
with open('upload.pdf', 'wb') as f:
f.write(r.content)
global document, index
document = loader.load_data(file=Path('./upload.pdf'))
index = GPTSimpleVectorIndex.from_documents(document)
pdf = pdfplumber.open(BytesIO(r.content))
paper_text, misc_text, title_related = chatbot.parse_paper(pdf)
global df_main, df_misc
df_main = chatbot.paper_df(paper_text)
df_main = chatbot.calculate_embeddings(df_main)
if misc_text != []:
df_misc = chatbot.paper_df(misc_text)
df_misc = chatbot.calculate_embeddings(df_misc)
title_request = chatbot.get_title(title_related)
global title
title = chatbot.gpt(title_request, False)
title = title['answer']
print("Done processing pdf")
return {'key': ''}
@app.route("/api/reply", methods=['POST'])
def reply():
chatbot = Chatbot()
query = request.json['query']
query = str(query)
scope_request = chatbot.get_scope(query)
scope = chatbot.gpt(scope_request, True)
if 'true' in scope['answer'] or 'True' in scope['answer'] or 'yes' in scope['answer'] or 'Yes' in scope['answer']:
scope = True
else:
scope = False
print('Scope: ', scope)
# Define a list of keywords to match
keywords_to_match = ['reference', 'references', 'cite', 'citation', 'citations', 'cited', 'citing', 'acknowledgment', 'acknowledgements', 'appendix', 'appendices', '参考文献', '致谢', '附录']
# Check if the query matches any of the keywords and if 'df_misc' is defined
if any(keyword in query for keyword in keywords_to_match) and 'df_misc' in globals():
# If the query matches and 'df_misc' is defined, create messages with 'df_misc'
messages = chatbot.create_messages(df_main, query, title, True, df_misc)
elif scope and 'df_misc' in globals():
# If 'scope' is True and 'df_misc' is defined, create messages with 'df_misc'
messages = chatbot.create_messages(df_main, query, title, True, df_misc)
elif scope:
# If 'scope' is True but 'df_misc' is not defined, create messages without 'df_misc'
messages = chatbot.create_messages(df_main, query, title, True)
else:
# If none of the above conditions are met, create messages without 'df_misc'
messages = chatbot.create_messages(df_main, query, title, False)
# Call the gpt function with the messages and False as arguments
response = chatbot.gpt(messages, False)
print(response)
return response, 200


@app.route('/', defaults={'path': ''})
@app.route('/<path:path>')
def catch_all(path):
    return render_template('index.html')


if __name__ == '__main__':
    # PORT must be numeric; 5000 is an assumed fallback when the variable is unset
    app.run(host='0.0.0.0', port=int(os.getenv("PORT", 5000)), debug=True)
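# A minimal usage sketch (assumptions: the server runs on localhost:5000, a local
# 'paper.pdf' exists, and the endpoint paths match the routes defined above):
#
#   import requests
#   with open('paper.pdf', 'rb') as f:
#       requests.post('http://localhost:5000/api/process_pdf', data=f.read())
#   r = requests.post('http://localhost:5000/api/reply',
#                     json={'query': 'What is the main contribution of this paper?'})
#   print(r.json()['answer'])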