#To load all (or a subset of) the files, uncomment the processfiles() calls below
#To index them in Elasticsearch, uncomment the elastic_index call at the end of processfiles()
#The doc2vec model is trained on the abstracts in metadata.csv and does not need all files;
#to rebuild it, delete the saved model file and run this file again
from helpers import *
from constants import *
import os
import json
import numpy as np
from tqdm.auto import tqdm  #picks the notebook or console progress bar automatically
from elasticsearch import Elasticsearch
from elasticsearch import helpers
import pandas as pd
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
def loaddocs():
    for file in tqdm(all_files):
        #Not all file types have an abstract (e.g. expert reviews), so fall
        #back to an empty one, run through format_body as well so the
        #abstract field always has the same type
        abstr = format_body([{'text': ''}])
        if 'abstract' in file:
            abstr = format_body(file['abstract'])
        features = [
            file['paper_id'],
            file['metadata']['title'],
            format_authors(file['metadata']['authors']),
            format_authors(file['metadata']['authors'], with_affiliation=True),
            abstr,
            format_body(file['body_text']),
            format_bib(file['bib_entries']),
            file['metadata']['authors'],
            file['bib_entries']
        ]
        cleaned_files.append(features)
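#For reference, each entry in cleaned_files is laid out as:
#  [0] paper_id, [1] title, [2] authors, [3] authors with affiliation,
#  [4] abstract, [5] full body text, [6] formatted bibliography,
#  [7] raw authors, [8] raw bib_entries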
def elastic_generator(indexname):
    for file in tqdm(cleaned_files):
        yield {
            "_index": indexname,
            "_type": "_doc",
            "_id": file[0],
            "_source": {
                "title": file[1],
                "abstract": file[4],
                "fulltext": file[5]
            }
        }
def elastic_index(indexname):
    helpers.bulk(es_client, elastic_generator(indexname))
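#A minimal search sketch (assuming a local Elasticsearch 7.x instance and a
#built index; es_client and indexName are set up further down, and the query
#string is just an example). Uncomment to try it:
#res = es_client.search(index=indexName, body={"query": {"match": {"abstract": "coronavirus transmission"}}})
#for hit in res['hits']['hits']:
#    print(hit['_id'], hit['_source']['title'])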
#Filtered metadata (rows with a usable abstract); filled in by preparedoc2vec
metadatana = []
def preparedoc2vec(fname, data):
    global metadatana
    #Load a previously trained model if one exists
    if os.path.isfile(fname):
        print("Loaded doc2vec model " + fname)
        model = Doc2Vec.load(fname)
    else:
        print("Training doc2vec model " + fname)
        #Remove items with bad abstracts
        docs = data[~data.abstract.isin(["Unknown", "unknown", ""])]
        #Remove items with missing abstracts (filter docs, not data, so both filters apply)
        docs = docs[~docs.abstract.isnull()]
        metadatana = docs
        docvals = docs['abstract'].values.tolist()
        documents = [TaggedDocument(gensim.parsing.preprocess_string(doc), [i]) for i, doc in enumerate(docvals)]
        #Sanity check: documents[0] should look like a tokenized abstract
        #This used to be trained on the processed files, but the abstracts are
        #available in metadata.csv in the new version:
        #[TaggedDocument(doc[4], [i]) for i, doc in enumerate(cleaned_files)]
        #Note: the seed only gives fully reproducible runs with workers=1
        model = gensim.models.doc2vec.Doc2Vec(dm=1, vector_size=100, min_count=2, epochs=20, seed=42, workers=3)
        model.build_vocab(documents)
        model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)
        model.save(fname)
    return model
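#A minimal usage sketch: embed new text and look up the most similar training
#abstracts. model.docvecs is the gensim 3.x attribute (gensim 4.x renamed it
#to model.dv), and the query string is just an example:
#vec = model.infer_vector(gensim.parsing.preprocess_string("antiviral drug trials"))
#print(model.docvecs.most_similar([vec], topn=5))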
#Process/read/clean all files; also used for indexing the files
all_files = []
cleaned_files = []
def processfiles(readname):
    readdir = './data/' + readname + "/"
    filenames = os.listdir(readdir)
    print("Number of articles retrieved from " + readname + ":", len(filenames))
    for filename in filenames:
        filepath = readdir + filename
        with open(filepath, 'rb') as f:
            all_files.append(json.load(f))
    #Load and clean the documents
    loaddocs()
    #Index them in Elasticsearch
    #elastic_index(indexName)
def prepTREC(fname):
    #Get the valid TREC ids for this round
    TRECids = []
    with open(fname) as f:
        for line in f.readlines():
            TRECids.append(line.rstrip('\n'))
    metadata = pd.read_csv("./data/metadata.csv")
    #Now we filter out all the TREC ids we don't need
    metadata = metadata[metadata.cord_uid.isin(TRECids)]
    #There are 33 duplicates (the same paper from multiple sources)
    metadata = metadata.drop_duplicates(subset='cord_uid', keep='first')
    return metadata
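#Note: the docids file is expected to hold one cord_uid per line, matching the
#cord_uid column of metadata.csv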
#Used to store/load the distance of each doc to each task
def docToTaskScores(fname):
    scores = []
    if os.path.isfile(fname):
        print("Loaded document-to-task scores from " + fname)
        for line in open(fname):
            scores.append(line.rstrip("\n").split(" "))
    else:
        print("Storing document-to-task scores in " + fname)
        f = open(fname, "a")
        for file in cleaned_files:
            newline = str(file[0])
            for index, task in enumerate(list_of_tasks):
                #Embed the last 200 words of the full text and take the
                #Euclidean distance to the task vector. Docs without an
                #abstract still get a (rough) score this way, based on the
                #assumption that they are less valuable anyway.
                dist = np.linalg.norm(taskvectors[index] - get_doc_vector(model, " ".join(file[5].split(" ")[-200:])))
                newline += " " + str(dist)
            f.write(newline + "\n")
            scores.append(newline.split(" "))
        f.close()
    return scores
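#A usage sketch: each row of scores is [paper_id, dist_to_task0, dist_to_task1, ...]
#as strings, so the 1000 docs closest to e.g. task 0 could be found with:
#scores = docToTaskScores('docscores')
#closest = sorted(scores, key=lambda row: float(row[1]))[:1000]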
def get_doc_vector(model, doc):
    tokens = gensim.parsing.preprocess_string(doc)
    return model.infer_vector(tokens)
#Set up Elasticsearch (assumes a local instance on the default port)
indexName = "4-10-covid"
es_client = Elasticsearch(http_compress=True)
#Process the files: clean them and index them
#processfiles("biorxiv_medrxiv/pdf_json")
#processfiles("comm_use_subset/pmc_json")
#processfiles("comm_use_subset/pdf_json")
#processfiles("custom_license/pmc_json")
#processfiles("custom_license/pdf_json")
#processfiles("noncomm_use_subset/pmc_json")
#processfiles("noncomm_use_subset/pdf_json")
metadata = prepTREC('./docids-rnd1.txt')
#Now we train or load a doc2vec model
model = preparedoc2vec("./covid-doc2vec.model", metadata)
#Now we compute the distance of each doc to each task vector
#and store that in docscores
#First, create a list of the task vectors
taskvectors = []
for task in list_of_tasks:
    taskvectors.append(get_doc_vector(model, task))
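#A hypothetical sketch (not part of the original pipeline) of using the task
#vectors directly: rank the filtered metadata abstracts by Euclidean distance
#to the first task vector.
#dists = [np.linalg.norm(taskvectors[0] - get_doc_vector(model, str(a))) for a in metadata['abstract'].fillna('')]
#closest10 = metadata.iloc[np.argsort(dists)[:10]]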
#doc2task scores are no longer used; they were used to find the 1000 docs closest to a task
#docToTaskScores('docscores')