1
'''
Given a list of sentences on the same topic (for example, taken from one news article),
this module returns the sentence whose average similarity score against the other
sentences is the highest. This is useful for finding the main sentence.
'''
5
+ import numpy as np
6
def jaccard_similarity(str_1, str_2):
    """Return the Jaccard similarity between the word sets of two strings.

    Both strings are split on whitespace and reduced to sets of distinct
    tokens ``A`` and ``B``; the score is ``|A & B| / |A | B|``.

    Args:
        str_1: First text.
        str_2: Second text.

    Returns:
        A float between 0.0 (no shared token) and 1.0 (identical token
        sets). When neither string contains any token the ratio is
        undefined, and 0.0 is returned instead of raising
        ``ZeroDivisionError``.
    """
    words_1 = set(str_1.split())
    words_2 = set(str_2.split())
    intersect = words_1 & words_2
    union_size = len(words_1) + len(words_2) - len(intersect)
    if union_size == 0:
        # Both inputs are empty or whitespace-only: nothing to compare.
        return 0.0
    return float(len(intersect)) / union_size
12
+
13
def max_avg_jaccard_sim(sentence_list, show_avg=False):
    """Return the sentence with the highest average Jaccard similarity.

    Each sentence in ``sentence_list`` is scored with ``jaccard_similarity``
    against every sentence of the list that is at least two characters long
    (itself included), and those scores are averaged per sentence.

    Args:
        sentence_list: Sentences (strings) belonging to one cluster/topic.
        show_avg: When true, also return the winning average score.

    Returns:
        The sentence with the maximum average score, or a
        ``(sentence, average_score)`` tuple when ``show_avg`` is true.
        On ties the sentence that appears first in ``sentence_list`` wins.

    Raises:
        ValueError: If ``sentence_list`` holds no sentence of two or more
            characters, so no average can be computed.
    """
    # Texts shorter than two characters are skipped as comparison targets.
    targets = [text for text in sentence_list if len(text) >= 2]
    if not targets:
        raise ValueError(
            "sentence_list must contain at least one sentence of two or more characters"
        )

    text_avg_sim = {}
    for sentence in sentence_list:
        # Scores are collected per sentence; sharing one list across
        # sentences would leak earlier sentences' scores into this average.
        scores = [jaccard_similarity(sentence, text) for text in targets]
        text_avg_sim[sentence] = sum(scores) / len(scores)

    # Key with the maximum average score.
    best_sentence = max(text_avg_sim, key=text_avg_sim.get)
    if show_avg:
        return best_sentence, text_avg_sim[best_sentence]
    return best_sentence
30
+
31
+
32
if __name__ == "__main__":
    # Demo: pick the most representative sentence of a short news snippet.
    sentences = [
        "Manchester United midfielder Scott McTominay secured the Man-of-the Match award after the Reds’ 3-1 win at Norwich City.",
        "The 21-year-old took home 50 per cent of your vote after netting our 21st-minute opener – which was our 2,000th strike in Premier League history.",
        "The Scotland international said he was delighted with the team's performance, as well as securing his place in the record books.",
    ]
    print(max_avg_jaccard_sim(sentence_list=sentences))
0 commit comments