from flask import current_app
from pdf2image import convert_from_bytes
from thefuzz import fuzz
+from textacy.extract.basics import ngrams


class TextReport:
@@ -30,6 +31,7 @@ def __init__(self, file_obj, lang):
        self.image_stack = []
        self.raw_text = ""
        self.text_as_list = []
+        self.sentence_as_list = []
        self.header_text = []
        if self.lang == "fra":
            self.nlp = spacy.load("fr_core_news_sm")
@@ -39,7 +41,7 @@ def __init__(self, file_obj, lang):
            self.nlp = spacy.load("en_core_web_sm")
            self.negexlist = current_app.config["NEGEX_LIST_EN"]
            self.negex_sent = current_app.config["NEGEX_SENT_EN"]
-
+        self.all_stopwords = self.nlp.Defaults.stop_words
        self.results_match_dict = {}

    def get_grayscale(self, image):
@@ -109,7 +111,8 @@ def _detect_negation(self, sentence: str) -> list:
        sent = self.nlp(sentence)
        for negex_term in self.negexlist:
            if len(negex_term.split(" ")) == 1:
-                for i in sent.text.lower().split(" "):
+                token_list = [word.text.lower() for word in sent if word.is_alpha]
+                for i in token_list:
                    if i == negex_term:
                        return sent, True
            else:
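# Aside: a minimal sketch of why tokenizing with spaCy beats split(" ") for
# negation cues: a naive split leaves punctuation glued to words, so "no,"
# never equals the negex term "no". Assumes en_core_web_sm is installed;
# the sample sentence is illustrative only.
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("No, further metastasis was found.")
print([t.text.lower() for t in doc if t.is_alpha])
# ['no', 'further', 'metastasis', 'was', 'found']
print("No, further metastasis was found.".lower().split(" ")[0])
# 'no,' (the trailing comma defeats an exact match against "no")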
@@ -127,8 +130,8 @@ def _split_sentence(self, sent_original: object) -> list:
            list: list of sub-sentences from the original sentence
        """
        for sent_sep in self.negex_sent:
-            if sent_sep in sent_original.text:
-                sent_list = sent_original.text.split(sent_sep)
+            if sent_sep.lower() in sent_original.text.lower():
+                sent_list = sent_original.text.lower().split(sent_sep)
                break
        else:
            sent_list = [sent_original.text]
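# Aside: the for/else above relies on Python's loop-else: the else branch
# runs only when the loop finishes without break, i.e. when no separator
# matched. A self-contained sketch with made-up separators:
text = "lungs are clear but a small nodule persists"
for sep in [" but ", " although "]:
    if sep in text:
        parts = text.split(sep)
        break
else:
    parts = [text]
print(parts)  # ['lungs are clear', 'a small nodule persists']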
@@ -144,44 +147,21 @@ def _spacy_ngrams(self, text_section: str) -> dict:
            dict: all n-grams detected with negation boolean
        """
        doc = self.nlp(text_section)
-        final_one_ngrams = []
-        final_two_ngrams = []
-        final_three_ngrams = []
+        full_ngrams = []
        for sent_original in doc.sents:
+            self.sentence_as_list.append(sent_original.text)
            sent_list = self._split_sentence(sent_original)
+            # Detect negation in sentence part and extract n-grams up to 6 words
            for sent_str in sent_list:
+                n_gram_size = []
                sent, flag_neg = self._detect_negation(sent_str)
-                temp_token_list = []
-                for token in sent:
-                    # if not token.is_stop and not token.is_punct and token.is_alpha:
-                    if not token.is_punct and token.is_alpha:
-                        final_one_ngrams.append(
-                            [token.text.lower(), 0 if flag_neg else 1]
-                        )
-                        temp_token_list.append(token.text.lower())
-                if len(temp_token_list) > 1:
-                    for i in range(len(temp_token_list) - 1):
-                        final_two_ngrams.append(
-                            [
-                                " ".join([temp_token_list[i], temp_token_list[i + 1]]),
-                                0 if flag_neg else 1,
-                            ]
-                        )
-                if len(temp_token_list) > 2:
-                    for i in range(len(temp_token_list) - 2):
-                        final_three_ngrams.append(
-                            [
-                                " ".join(
-                                    [
-                                        temp_token_list[i],
-                                        temp_token_list[i + 1],
-                                        temp_token_list[i + 2],
-                                    ]
-                                ),
-                                0 if flag_neg else 1,
-                            ]
-                        )
-        full_ngrams = final_one_ngrams + final_two_ngrams + final_three_ngrams
+                ngrams_generator = ngrams(sent, (1, 2, 3, 4, 5, 6), filter_punct=True)
+                for i in ngrams_generator:
+                    pos_ngrams = " ".join(self.sentence_as_list).find(i.text)
+                    full_ngrams.append(
+                        (i.text.lower(), 0 if flag_neg else 1, pos_ngrams)
+                    )
+
        return full_ngrams

    def _match_ngram_ontology(self, full_ngrams) -> list:
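# Aside: a minimal sketch of what textacy's ngrams() yields, assuming
# textacy and en_core_web_sm are installed. Note that filter_stops defaults
# to True, so stop words are dropped unless that flag is overridden.
import spacy
from textacy.extract.basics import ngrams

nlp = spacy.load("en_core_web_sm")
doc = nlp("chronic obstructive pulmonary disease")
spans = ngrams(doc, (1, 2, 3), filter_punct=True)
print([s.text for s in spans])
# spaCy Spans for every unigram, bigram and trigram, e.g. 'chronic',
# 'chronic obstructive', 'chronic obstructive pulmonary', ...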
@@ -202,14 +182,38 @@ def _match_ngram_ontology(self, full_ngrams) -> list:
            ontology_terms.append([i["id"], i["text"]])
            for synonym in i["data"]["synonymes"].split(","):
                ontology_terms.append([i["id"], synonym])
-        for i in full_ngrams:
-            for j in ontology_terms:
-                score = fuzz.ratio(i[0].lower(), j[1].lower())
-                if score >= 80:
-                    # [neg_flag, ngram, match_term, node_id]
-                    match_list.append([i[1], i[0], j[1], j[0]])
+
+        n_grams_words = [i[0] for i in full_ngrams]
+        onto_words = [i[1] for i in ontology_terms]
+        full_ngrams_processed = self._lemmatize_list(n_grams_words)
+        full_onto_processed = self._lemmatize_list(onto_words)
+        for n_gram_index, i in enumerate(full_ngrams_processed):
+            for onto_index, j in enumerate(full_onto_processed):
+                score = fuzz.ratio(i.lower(), j.lower())
+                if score >= 85:
+                    # [neg_flag, ngram, match_term, node_id, score, match pos in string]
+                    match_list.append(
+                        [
+                            full_ngrams[n_gram_index][1],
+                            i,
+                            j,
+                            ontology_terms[onto_index][0],
+                            score,
+                            full_ngrams[n_gram_index][2],
+                        ]
+                    )
        return match_list

+    def _lemmatize_list(self, list_ngrams: list) -> list:
+        result_list = []
+        for elm in list_ngrams:
+            result = self.nlp(elm, disable=["tok2vec", "parser", "ner"])
+            sent_no_stop = " ".join(
+                [word.lemma_ for word in result if not word.is_stop]
+            )
+            result_list.append(sent_no_stop)
+        return result_list
+
    def analyze_text(self) -> list:
        """Analyse the whole text of the PDF and match it to the standard vocabulary
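# Aside: a minimal sketch of the lemmatize-then-fuzzy-match step, assuming
# thefuzz and en_core_web_sm are installed. The sketch disables only the
# pipeline components lemmatization does not need; the phrases and the 85
# cutoff mirror the hunk above but are illustrative.
import spacy
from thefuzz import fuzz

nlp = spacy.load("en_core_web_sm")

def lemmatize(text: str) -> str:
    doc = nlp(text, disable=["parser", "ner"])
    return " ".join(t.lemma_ for t in doc if not t.is_stop)

a = lemmatize("enlarged lymph nodes")
b = lemmatize("enlarged lymph node")
print(fuzz.ratio(a, b) >= 85)  # True: plural and singular share a lemma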
@@ -218,7 +222,7 @@ def analyze_text(self) -> list:
            First value is the neg flag, second value is the ngram, third value
            is the matching terms, last value is the node ID.
        """
-        full_ngrams = self._spacy_ngrams(self.raw_text)
+        full_ngrams = self._spacy_ngrams(self.raw_text.replace("\n", " "))
        match_list = self._match_ngram_ontology(full_ngrams)
        return match_list
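# Aside: why the newline flattening matters: OCR/PDF extraction hard-wraps
# lines, and spaCy's sentence segmenter can split a sentence at every wrap.
# Illustrative only:
text = "Patient shows no sign\nof pleural effusion."
print(text.replace("\n", " "))
# 'Patient shows no sign of pleural effusion.' (one sentence again)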