@@ -10,6 +10,7 @@
 from flask import current_app
 from pdf2image import convert_from_bytes
 from thefuzz import fuzz
+from textacy.extract.basics import ngrams
 
 
 class TextReport:
@@ -30,6 +31,7 @@ def __init__(self, file_obj, lang):
         self.image_stack = []
         self.raw_text = ""
         self.text_as_list = []
+        self.sentence_as_list = []
         self.header_text = []
         if self.lang == "fra":
             self.nlp = spacy.load("fr_core_news_sm")
@@ -39,7 +41,7 @@ def __init__(self, file_obj, lang):
             self.nlp = spacy.load("en_core_web_sm")
             self.negexlist = current_app.config["NEGEX_LIST_EN"]
             self.negex_sent = current_app.config["NEGEX_SENT_EN"]
-
+        self.all_stopwords = self.nlp.Defaults.stop_words
         self.results_match_dict = {}
 
     def get_grayscale(self, image):
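Side note: `nlp.Defaults.stop_words` is spaCy's built-in stop word set for the loaded pipeline's language; a minimal sketch of what the new `all_stopwords` attribute holds (assuming `en_core_web_sm` is installed):

import spacy

nlp = spacy.load("en_core_web_sm")
stop_words = nlp.Defaults.stop_words  # plain set of lowercase strings
print(len(stop_words) > 300, "the" in stop_words)  # True True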
@@ -109,7 +111,8 @@ def _detect_negation(self, sentence: str) -> list:
         sent = self.nlp(sentence)
         for negex_term in self.negexlist:
             if len(negex_term.split(" ")) == 1:
-                for i in sent.text.lower().split(" "):
+                token_list = [word.text.lower() for word in sent if word.is_alpha]
+                for i in token_list:
                     if i == negex_term:
                         return sent, True
             else:
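The tokenization change above is easiest to see side by side; a small sketch (assuming the English model):

import spacy

nlp = spacy.load("en_core_web_sm")
sent = nlp("No fracture, without lesion.")
# Old behaviour: whitespace split keeps punctuation glued to words
print(sent.text.lower().split(" "))  # ['no', 'fracture,', 'without', 'lesion.']
# New behaviour: spaCy tokens filtered with is_alpha drop the punctuation
print([w.text.lower() for w in sent if w.is_alpha])  # ['no', 'fracture', 'without', 'lesion']

This way a single-word negation term still matches when the word carries trailing punctuation.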
@@ -127,8 +130,8 @@ def _split_sentence(self, sent_original: object) -> list:
             list: list of sub-sentences from the original sentence
         """
         for sent_sep in self.negex_sent:
-            if sent_sep in sent_original.text:
-                sent_list = sent_original.text.split(sent_sep)
+            if sent_sep.lower() in sent_original.text.lower():
+                sent_list = sent_original.text.lower().split(sent_sep.lower())
                 break
         else:
             sent_list = [sent_original.text]
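A quick sketch of the case-insensitive split and the for/else fallback, with a hypothetical separator list (the real values come from the NEGEX_SENT_* config):

negex_sent = ["but", "however"]  # hypothetical separators
text = "Small lesion BUT no fracture"
for sent_sep in negex_sent:
    if sent_sep.lower() in text.lower():
        sent_list = text.lower().split(sent_sep.lower())
        break
else:  # no separator found: keep the sentence whole
    sent_list = [text]
print(sent_list)  # ['small lesion ', ' no fracture']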
@@ -144,44 +147,21 @@ def _spacy_ngrams(self, text_section: str) -> dict:
             dict: all n-grams detected with negation boolean
         """
         doc = self.nlp(text_section)
-        final_one_ngrams = []
-        final_two_ngrams = []
-        final_three_ngrams = []
+        full_ngrams = []
         for sent_original in doc.sents:
+            self.sentence_as_list.append(sent_original.text)
             sent_list = self._split_sentence(sent_original)
+            # Detect negation in each sentence part, then extract n-grams up to 6 words
             for sent_str in sent_list:
                 sent, flag_neg = self._detect_negation(sent_str)
-                temp_token_list = []
-                for token in sent:
-                    # if not token.is_stop and not token.is_punct and token.is_alpha:
-                    if not token.is_punct and token.is_alpha:
-                        final_one_ngrams.append(
-                            [token.text.lower(), 0 if flag_neg else 1]
-                        )
-                        temp_token_list.append(token.text.lower())
-                if len(temp_token_list) > 1:
-                    for i in range(len(temp_token_list) - 1):
-                        final_two_ngrams.append(
-                            [
-                                " ".join([temp_token_list[i], temp_token_list[i + 1]]),
-                                0 if flag_neg else 1,
-                            ]
-                        )
-                if len(temp_token_list) > 2:
-                    for i in range(len(temp_token_list) - 2):
-                        final_three_ngrams.append(
-                            [
-                                " ".join(
-                                    [
-                                        temp_token_list[i],
-                                        temp_token_list[i + 1],
-                                        temp_token_list[i + 2],
-                                    ]
-                                ),
-                                0 if flag_neg else 1,
-                            ]
-                        )
-        full_ngrams = final_one_ngrams + final_two_ngrams + final_three_ngrams
+                ngrams_generator = ngrams(sent, (1, 2, 3, 4, 5, 6), filter_punct=True)
+                for i in ngrams_generator:
+                    # Position of the n-gram in the accumulated text (case-insensitive)
+                    pos_ngrams = " ".join(self.sentence_as_list).lower().find(i.text.lower())
+                    full_ngrams.append(
+                        (i.text.lower(), 0 if flag_neg else 1, pos_ngrams)
+                    )
+
         return full_ngrams
 
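For reference, a minimal sketch of what textacy's `ngrams` helper yields (assuming textacy and `en_core_web_sm` are installed; note that `filter_stops` defaults to True and so also applies to the call above):

import spacy
from textacy.extract.basics import ngrams

nlp = spacy.load("en_core_web_sm")
doc = nlp("no acute fracture")
# filter_stops defaults to True, so n-grams that start or end with a stop
# word (here the standalone "no") are dropped along with punctuation.
print([span.text for span in ngrams(doc, (1, 2, 3), filter_punct=True)])
# ['acute', 'fracture', 'acute fracture']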
     def _match_ngram_ontology(self, full_ngrams) -> list:
@@ -202,14 +182,39 @@ def _match_ngram_ontology(self, full_ngrams) -> list:
             ontology_terms.append([i["id"], i["text"]])
             for synonym in i["data"]["synonymes"].split(","):
                 ontology_terms.append([i["id"], synonym])
-        for i in full_ngrams:
-            for j in ontology_terms:
-                score = fuzz.ratio(i[0].lower(), j[1].lower())
-                if score >= 80:
-                    # [neg_flag, ngram, match_term, node_id]
-                    match_list.append([i[1], i[0], j[1], j[0]])
+
+        n_grams_words = [i[0] for i in full_ngrams]
+        onto_words = [i[1] for i in ontology_terms]
+        full_ngrams_processed = self._lemmatize_list(n_grams_words)
+        full_onto_processed = self._lemmatize_list(onto_words)
+        for n_gram_index, i in enumerate(full_ngrams_processed):
+            for onto_index, j in enumerate(full_onto_processed):
+                score = fuzz.ratio(i.lower(), j.lower())
+                if score >= 85:
+                    # [neg_flag, ngram, match_term, node_id, score, match pos in string]
+                    match_list.append(
+                        [
+                            full_ngrams[n_gram_index][1],
+                            i,
+                            j,
+                            ontology_terms[onto_index][0],
+                            score,
+                            full_ngrams[n_gram_index][2],
+                        ]
+                    )
         return match_list
 
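For intuition on the `fuzz.ratio` threshold bump from 80 to 85, a quick check with thefuzz:

from thefuzz import fuzz

print(fuzz.ratio("fracture", "fractures"))  # 94, accepted at the >= 85 cut-off
print(fuzz.ratio("lesion", "lesions"))      # 92, accepted
print(fuzz.ratio("fracture", "lesion"))     # well below 85, rejected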
+    def _lemmatize_list(self, list_ngrams: list) -> list:
+        """Lemmatize each n-gram and drop stop words before fuzzy matching."""
+        result_list = []
+        for elm in list_ngrams:
+            result = self.nlp(elm, disable=["tok2vec", "parser", "ner"])
+            sent_no_stop = " ".join(
+                [word.lemma_ for word in result if not word.is_stop]
+            )
+            result_list.append(sent_no_stop)
+        return result_list
+
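To see what `_lemmatize_list` does to a term before scoring, a sketch (assuming the English model; the `disable` list is omitted here for simplicity):

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("fractures of the ribs")
# Stop words ("of", "the") are removed and the rest is lemmatized,
# so "fractures of the ribs" and "rib fracture" score much closer.
print(" ".join(w.lemma_ for w in doc if not w.is_stop))  # fracture rib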
     def analyze_text(self) -> list:
         """Analyse the whole text of the PDF and match it to the standard vocabulary
 
@@ -218,7 +222,7 @@ def analyze_text(self) -> list:
-            First value is the neg flag, second value is the ngram, third value
-            is the matching terms, last value is the node ID.
+            Values are: neg flag, ngram, matching term, node ID, fuzzy match
+            score, and position of the n-gram in the text.
         """
-        full_ngrams = self._spacy_ngrams(self.raw_text)
+        full_ngrams = self._spacy_ngrams(self.raw_text.replace("\n", " "))
         match_list = self._match_ngram_ontology(full_ngrams)
         return match_list
 
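Putting the pieces together, a hypothetical call site (the constructor signature comes from the diff; the lang code, Flask app object, and direct raw_text assignment are assumptions for illustration):

# Hypothetical usage sketch; a real run would populate raw_text via OCR.
with app.app_context():  # NEGEX_* keys must be present in app.config
    report = TextReport(file_obj, lang="eng")
    report.raw_text = "No acute fracture. Small lesion of the left rib."
    for neg_flag, ngram, term, node_id, score, pos in report.analyze_text():
        print(neg_flag, ngram, term, node_id, score, pos)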