Skip to content

Commit ad2a72c

Browse files
Merge pull request #722 from swicaksono/master
added djaccard similiarity algorithm
2 parents 5dfcd75 + 024e520 commit ad2a72c

File tree

1 file changed

+37
-0
lines changed

1 file changed

+37
-0
lines changed

Python/djaccard_simmiliarity.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
'''
2+
Given a list of sentences on the same topic (e.g. from one news article), this function will return
3+
the sentence whose average similarity score against the sentences in the list is highest. It is useful for finding the main sentence.
4+
'''
5+
import numpy as np
6+
def jaccard_similarity(str_1, str_2):
    """Return the Jaccard similarity between the word sets of two strings.

    Each string is split on whitespace into a set of unique tokens, and the
    score is the size of the intersection divided by the size of the union.

    Args:
        str_1: First text.
        str_2: Second text.

    Returns:
        A float between 0.0 and 1.0. When neither string contains a token
        the union is empty, and 0.0 is returned instead of dividing by zero.
    """
    words_1 = set(str_1.split())
    words_2 = set(str_2.split())
    union_size = len(words_1 | words_2)
    if not union_size:
        # Two empty/whitespace-only strings: nothing to compare.
        return 0.0
    return float(len(words_1 & words_2)) / union_size
12+
13+
def max_avg_jaccard_sim(sentence_list, show_avg=False):
    """Return the sentence with the highest average Jaccard similarity.

    Every sentence is scored by averaging its ``jaccard_similarity`` against
    each entry of ``sentence_list`` (itself included) that is at least two
    characters long.

    Args:
        sentence_list: List of sentence strings.
        show_avg: When true, also return the winning average score.

    Returns:
        The best-scoring sentence, or a ``(sentence, average)`` tuple when
        ``show_avg`` is true.

    Raises:
        ValueError: If ``sentence_list`` is empty.
    """
    if not sentence_list:
        raise ValueError("sentence_list must contain at least one sentence")

    # Entries shorter than two characters are skipped as comparison targets.
    comparable = [text for text in sentence_list if len(text) >= 2]

    text_avg_sim = {}
    for sentence in sentence_list:
        # Scores are collected per sentence; sharing one list across
        # sentences would turn each average into a running average.
        scores = [jaccard_similarity(sentence, text) for text in comparable]
        # With no comparable entries there is nothing to average.
        text_avg_sim[sentence] = sum(scores) / len(scores) if scores else 0.0

    best_sentence = max(text_avg_sim, key=text_avg_sim.get)
    if show_avg:
        return best_sentence, text_avg_sim[best_sentence]
    return best_sentence
30+
31+
32+
if __name__ == "__main__":
    # Demo: pick the most representative sentence of a short news snippet.
    sample_sentences = [
        "Manchester United midfielder Scott McTominay secured the Man-of-the Match award after the Reds’ 3-1 win at Norwich City.",
        "The 21-year-old took home 50 per cent of your vote after netting our 21st-minute opener – which was our 2,000th strike in Premier League history.",
        "The Scotland international said he was delighted with the team's performance, as well as securing his place in the record books.",
    ]
    print(max_avg_jaccard_sim(sentence_list=sample_sentences))

0 commit comments

Comments
 (0)