
Commit eabc9df

Merge pull request #95 from lambda-science/dev

Dev

2 parents e6eac6a + 02d9160

13 files changed: +1614 −1468 lines

.gitignore (+1)

@@ -177,3 +177,4 @@ docker/run.sh
 *.tar.gz
 IMPatienT
 !data/images/demo_patient
+.idea

app/historeport/ocr.py (+49 −45)

@@ -10,6 +10,7 @@
 from flask import current_app
 from pdf2image import convert_from_bytes
 from thefuzz import fuzz
+from textacy.extract.basics import ngrams


 class TextReport:
@@ -30,6 +31,7 @@ def __init__(self, file_obj, lang):
         self.image_stack = []
         self.raw_text = ""
         self.text_as_list = []
+        self.sentence_as_list = []
         self.header_text = []
         if self.lang == "fra":
             self.nlp = spacy.load("fr_core_news_sm")
@@ -39,7 +41,7 @@ def __init__(self, file_obj, lang):
             self.nlp = spacy.load("en_core_web_sm")
             self.negexlist = current_app.config["NEGEX_LIST_EN"]
             self.negex_sent = current_app.config["NEGEX_SENT_EN"]
-
+        self.all_stopwords = self.nlp.Defaults.stop_words
         self.results_match_dict = {}

     def get_grayscale(self, image):
@@ -109,7 +111,8 @@ def _detect_negation(self, sentence: str) -> list:
         sent = self.nlp(sentence)
         for negex_term in self.negexlist:
             if len(negex_term.split(" ")) == 1:
-                for i in sent.text.lower().split(" "):
+                token_list = [word.text.lower() for word in sent if word.is_alpha]
+                for i in token_list:
                     if i == negex_term:
                         return sent, True
             else:
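The fix above swaps a whitespace split for spaCy tokens filtered with is_alpha. A minimal sketch (the sentence and cue are invented) of why this matters: under a naive split, punctuation stays glued to words, so a negation cue like "no" goes undetected in "No,".

import spacy

nlp = spacy.load("en_core_web_sm")
sent = nlp("No, fiber necrosis was observed.")

naive = sent.text.lower().split(" ")                   # ['no,', 'fiber', ...]
tokens = [w.text.lower() for w in sent if w.is_alpha]  # ['no', 'fiber', ...]

print("no" in naive)   # False: the comma stays attached under a plain split
print("no" in tokens)  # True: spaCy tokenizes the comma away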
@@ -127,8 +130,8 @@ def _split_sentence(self, sent_original: object) -> list:
             list: list of sub-sentences from the original sentence
         """
         for sent_sep in self.negex_sent:
-            if sent_sep in sent_original.text:
-                sent_list = sent_original.text.split(sent_sep)
+            if sent_sep.lower() in sent_original.text.lower():
+                sent_list = sent_original.text.lower().split(sent_sep)
                 break
         else:
             sent_list = [sent_original.text]
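Two things change in _split_sentence: the separator test and the split become case-insensitive, and as a side effect the returned sub-sentences are now lowercased. A small sketch with invented values showing the for/else flow — the else branch runs only when no separator matched, keeping the sentence whole:

negex_sent = ["but", "except"]  # hypothetical separator list
text = "No necrosis BUT mild inflammation"
for sent_sep in negex_sent:
    if sent_sep.lower() in text.lower():
        sent_list = text.lower().split(sent_sep)
        break
else:  # no break: no separator found
    sent_list = [text]
print(sent_list)  # ['no necrosis ', ' mild inflammation']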
@@ -144,44 +147,21 @@ def _spacy_ngrams(self, text_section: str) -> dict:
             dict: all n-grams detected with negation boolean
         """
         doc = self.nlp(text_section)
-        final_one_ngrams = []
-        final_two_ngrams = []
-        final_three_ngrams = []
+        full_ngrams = []
         for sent_original in doc.sents:
+            self.sentence_as_list.append(sent_original.text)
             sent_list = self._split_sentence(sent_original)
+            # Detect negation in sentence part and extract n-grams up to 6 words
             for sent_str in sent_list:
+                n_gram_size = []
                 sent, flag_neg = self._detect_negation(sent_str)
-                temp_token_list = []
-                for token in sent:
-                    # if not token.is_stop and not token.is_punct and token.is_alpha:
-                    if not token.is_punct and token.is_alpha:
-                        final_one_ngrams.append(
-                            [token.text.lower(), 0 if flag_neg else 1]
-                        )
-                        temp_token_list.append(token.text.lower())
-                if len(temp_token_list) > 1:
-                    for i in range(len(temp_token_list) - 1):
-                        final_two_ngrams.append(
-                            [
-                                " ".join([temp_token_list[i], temp_token_list[i + 1]]),
-                                0 if flag_neg else 1,
-                            ]
-                        )
-                if len(temp_token_list) > 2:
-                    for i in range(len(temp_token_list) - 2):
-                        final_three_ngrams.append(
-                            [
-                                " ".join(
-                                    [
-                                        temp_token_list[i],
-                                        temp_token_list[i + 1],
-                                        temp_token_list[i + 2],
-                                    ]
-                                ),
-                                0 if flag_neg else 1,
-                            ]
-                        )
-        full_ngrams = final_one_ngrams + final_two_ngrams + final_three_ngrams
+                ngrams_generator = ngrams(sent, (1, 2, 3, 4, 5, 6), filter_punct=True)
+                for i in ngrams_generator:
+                    pos_ngrams = " ".join(self.sentence_as_list).find(i.text)
+                    full_ngrams.append(
+                        (i.text.lower(), 0 if flag_neg else 1, pos_ngrams)
+                    )
+
         return full_ngrams

     def _match_ngram_ontology(self, full_ngrams) -> list:
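This rewrite drops roughly forty lines of hand-rolled 1/2/3-gram loops in favor of textacy's ngrams() and extends extraction to 6-grams, while recording each n-gram's character position with .find() on the joined sentence list. A minimal sketch of the textacy call with an invented sentence; note that ngrams() also filters stopwords by default (filter_stops=True), on top of the explicit filter_punct=True:

import spacy
from textacy.extract.basics import ngrams

nlp = spacy.load("en_core_web_sm")
doc = nlp("Biopsy shows centronuclear myopathy with internalized nuclei.")

# Yields spaCy Spans of 1 to 6 tokens, punctuation (and stopwords) filtered
for span in ngrams(doc, (1, 2, 3, 4, 5, 6), filter_punct=True):
    print(span.text.lower())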
@@ -202,14 +182,38 @@ def _match_ngram_ontology(self, full_ngrams) -> list:
             ontology_terms.append([i["id"], i["text"]])
             for synonym in i["data"]["synonymes"].split(","):
                 ontology_terms.append([i["id"], synonym])
-        for i in full_ngrams:
-            for j in ontology_terms:
-                score = fuzz.ratio(i[0].lower(), j[1].lower())
-                if score >= 80:
-                    # [neg_flag, ngram, match_term, node_id]
-                    match_list.append([i[1], i[0], j[1], j[0]])
+
+        n_grams_words = [i[0] for i in full_ngrams]
+        onto_words = [i[1] for i in ontology_terms]
+        full_ngrams_processed = self._lemmatize_list(n_grams_words)
+        full_onto_processed = self._lemmatize_list(onto_words)
+        for n_gram_index, i in enumerate(full_ngrams_processed):
+            for onto_index, j in enumerate(full_onto_processed):
+                score = fuzz.ratio(i.lower(), j.lower())
+                if score >= 85:
+                    # [neg_flag, ngram, match_term, node_id, score, match pos in string]
+                    match_list.append(
+                        [
+                            full_ngrams[n_gram_index][1],
+                            i,
+                            j,
+                            ontology_terms[onto_index][0],
+                            score,
+                            full_ngrams[n_gram_index][2],
+                        ]
+                    )
         return match_list

+    def _lemmatize_list(self, list_ngrams: list) -> list:
+        result_list = []
+        for elm in list_ngrams:
+            result = self.nlp(elm, disable=["tok2vec", "parser", "ner"])
+            sent_no_stop = " ".join(
+                [word.lemma_ for word in result if not word.is_stop]
+            )
+            result_list.append(sent_no_stop)
+        return result_list
+
     def analyze_text(self) -> list:
         """Analyse the whole text of the PDF and match it to the standard vocabulary

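Matching now lemmatizes both the n-grams and the ontology terms (stopwords dropped) before fuzzy comparison, and the fuzz.ratio threshold rises from 80 to 85; each hit also keeps its score and text position. A minimal sketch of the pipeline with invented strings; the disable list mirrors the commit's _lemmatize_list:

import spacy
from thefuzz import fuzz

nlp = spacy.load("en_core_web_sm")

def lemmatize(text: str) -> str:
    # Lemmas without stopwords, so plural/singular variants converge
    doc = nlp(text, disable=["tok2vec", "parser", "ner"])
    return " ".join(w.lemma_ for w in doc if not w.is_stop)

score = fuzz.ratio(lemmatize("internalized nuclei").lower(),
                   lemmatize("internalized nucleus").lower())
print(score, score >= 85)  # kept as a match only at 85 or above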
@@ -218,7 +222,7 @@ def analyze_text(self) -> list:
             First value is the neg flag, second value is the ngram, third value
             is the matching terms, last value is the node ID.
         """
-        full_ngrams = self._spacy_ngrams(self.raw_text)
+        full_ngrams = self._spacy_ngrams(self.raw_text.replace("\n", " "))
         match_list = self._match_ngram_ontology(full_ngrams)
         return match_list
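Passing self.raw_text.replace("\n", " ") into _spacy_ngrams presumably keeps the hard line breaks in OCR output from fragmenting spaCy's sentence segmentation, since scanned reports wrap lines mid-sentence.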

app/historeport/routes.py (+1 −1)

@@ -251,7 +251,7 @@ def ocr_pdf():
     # pdf_object.detect_sections()
     # pdf_object.extract_section_text()
     match_list = pdf_object.analyze_text()
-    results = {"full_text": pdf_object.text_as_list, "match_list": match_list}
+    results = {"full_text": pdf_object.sentence_as_list, "match_list": match_list}
     return (
         json.dumps({"success": True, "results": results}),
         200,
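With this change the "full_text" field carries the sentences collected during n-gram extraction rather than the raw OCR line list. A hypothetical payload (all values invented) illustrating the shape the front end now receives:

results = {
    "full_text": ["First sentence of the report.", "Second sentence."],
    "match_list": [
        # [neg_flag, ngram, match_term, node_id, score, match pos in string]
        [1, "internalized nucleus", "internalized nucleus", "node_42", 100, 31],
    ],
}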

app/historeport/static/historeport.js (+17 −9)

@@ -13,7 +13,7 @@ var input4_tag = new Tagify(input4);
 var input5 = document.querySelector("input[id=phenotype_datamined]");
 var input5_tag = new Tagify(input5);
 var input6 = document.querySelector("input[id=alternative_language]");
-var input6_tag = new Tagify(input6);
+var input6_tag = new Tagify(input6, { maxTags: 1 });
 var input7 = document.querySelector("input[id=correlates_with]");
 var input7_tag = new Tagify(input7);
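Tagify's maxTags setting caps the field at a single tag, so alternative_language can no longer accumulate multiple entries; the same one-line change is applied to app/ontocreate/static/ontocreate.js below.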

@@ -442,8 +442,10 @@ $(function () {
     let absent_feat_overview_auto = document.getElementById(
       "feature-absent-auto"
     );
-    present_feat_overview_auto.innerHTML = "";
-    absent_feat_overview_auto.innerHTML = "";
+    present_feat_overview_auto.innerHTML =
+      "ID | Vocab. Term | Pos. in Text | Text | Score<br />";
+    absent_feat_overview_auto.innerHTML =
+      "ID | Vocab. Term | Pos. in Text | Text | Score<br />";

     // For each entires in our match list add to corresponding accordion
     for (const [key, value] of Object.entries(
@@ -453,21 +455,27 @@ $(function () {
         present_feat_overview_auto.innerHTML +=
           "<span style='color:green'>" +
           value[3] +
-          " " +
+          " | " +
           value[2] +
-          " (" +
+          " | " +
+          value[5] +
+          " | " +
           value[1] +
-          ")" +
+          " | " +
+          value[4] +
           "</span><br />";
       } else if (value[0] == 0) {
         absent_feat_overview_auto.innerHTML +=
           "<span style='color:red'>" +
           value[3] +
-          " " +
+          " | " +
           value[2] +
-          " (" +
+          " | " +
+          value[5] +
+          " | " +
           value[1] +
-          ")" +
+          " | " +
+          value[4] +
           "</span><br />";
       }
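Given the new six-element match_list entries ([neg_flag, ngram, match_term, node_id, score, match pos in string]), value[3] is the node ID, value[2] the matched vocabulary term, value[5] the position in text, value[1] the n-gram, and value[4] the score, lining up with the "ID | Vocab. Term | Pos. in Text | Text | Score" header row added above.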

app/historeport/templates/historeport.html (+2 −2)

@@ -81,7 +81,7 @@ <h2 class="accordion-header" id="heading1">
 <h2 class="accordion-header" id="heading2">
   <button class="accordion-button collapsed" type="button" data-bs-toggle="collapse"
     data-bs-target="#collapse2" aria-expanded="true" aria-controls="collapse2">
-    Vocabulary Automatically Detected as Present
+    Vocabulary Automatically Annotated (Present)
   </button>
 </h2>
 <div id="collapse2" class="accordion-collapse collapse" aria-labelledby="heading2"

@@ -265,4 +265,4 @@ <h4>Commentaries and Conclusions</h4>
 <meta id="data-url" data-boqa="{{url_for('historeport.predict_diag_boqa') }}">
 <script src="{{ url_for('historeport.static', filename='historeport.js') }}"></script>

-{% endblock %}
\ No newline at end of file
+{% endblock %}

app/ontocreate/static/ontocreate.js (+1 −1)

@@ -13,7 +13,7 @@ var input4_tag = new Tagify(input4);
 var input5 = document.querySelector("input[id=phenotype_datamined]");
 var input5_tag = new Tagify(input5);
 var input6 = document.querySelector("input[id=alternative_language]");
-var input6_tag = new Tagify(input6);
+var input6_tag = new Tagify(input6, { maxTags: 1 });
 var input7 = document.querySelector("input[id=correlates_with]");
 var input7_tag = new Tagify(input7);

data/database/app.db.demo (4 KB)

Binary file not shown.
