-
Notifications
You must be signed in to change notification settings - Fork 480
/
Copy pathsimilarity_chars.py
executable file
·80 lines (73 loc) · 2.85 KB
/
similarity_chars.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Scott Hendrickson
# 2011-09-15
# Compares by letters
# See similarityWords.py to compare by words
import codecs
import sys
#reload(sys)
#sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
#sys.stdin = codecs.getreader('utf-8')(sys.stdin)
class SS2Similar(object):
    """Character-level string distance and similarity with bounded look-ahead.

    Two strings are walked in lockstep with a shared cursor.  When the
    characters under the cursor differ, up to ``maxOffset`` positions ahead
    are searched in each string for a character that re-aligns them.

    NOTE(review): the structure resembles the "Sift2" string-distance
    heuristic -- confirm against the original reference.
    """

    def __init__(self, _maxOffset):
        """Remember the look-ahead window.

        _maxOffset -- exclusive upper bound on how many positions ahead
                      ``distance`` searches for a re-aligning character.
        """
        self.maxOffset = _maxOffset

    def distance(self, s1, s2):
        """Return the distance between ``s1`` and ``s2`` (0 for equal strings).

        ``None`` is treated like the empty string.  Two empty inputs give
        ``0.``; if exactly one input is empty the length of the other one is
        returned (an int); otherwise the result is a float.
        """
        if s1 is None or s1 == '':
            if s2 is None or s2 == '':
                return 0.
            return len(s2)
        if s2 is None or s2 == '':
            return len(s1)

        len1 = len(s1)
        len2 = len(s2)
        c = 0        # shared cursor into both strings
        offset1 = 0  # current shift applied to s1
        offset2 = 0  # current shift applied to s2
        dist = 0     # mismatches / shifts counted so far
        while c + offset1 < len1 and c + offset2 < len2:
            if s1[c + offset1] != s2[c + offset2]:
                # Alignment lost: drop the shifts and look ahead for a new one.
                offset1 = 0
                offset2 = 0
                for i in range(self.maxOffset):
                    if c + i < len1 and s1[c + i] == s2[c]:
                        # i == 0 means the unshifted characters already
                        # agree, so nothing is charged for it.
                        if i > 0:
                            dist += 1
                        offset1 = i
                        break
                    if c + i < len2 and s1[c] == s2[c + i]:
                        if i > 0:
                            dist += 1
                        offset2 = i
                        break
                else:
                    # Nothing inside the window re-aligns the strings.
                    dist += 1
            c += 1
        # Add the average of the two lengths (minus their final shifts),
        # less the number of cursor positions that were visited.
        return dist + (len1 - offset1 + len2 - offset2) / 2. - c

    def similarity(self, s1, s2):
        """Return ``1 - distance(s1, s2) / max(len(s1), len(s2))``.

        ``None`` is treated like the empty string, matching ``distance``.
        Two empty inputs give ``1.``.
        """
        # distance() accepts None, so accept it here too instead of failing
        # in len().
        if s1 is None:
            s1 = ''
        if s2 is None:
            s2 = ''
        maxLen = max(len(s1), len(s2))
        if maxLen == 0:
            return 1.
        # Float divisor: distance() may return an int for empty inputs.
        return 1. - self.distance(s1, s2) / float(maxLen)
if __name__ == '__main__':
    # Demo: score every pair of sample texts (self-pairs included) for each
    # look-ahead window from 5 to 39 and print "i j shift similarity".
    # this set is classified fairly well at shift:30 and score > 0.5
    samples = [
        "On the topic of Strata Talk. If you live in a stratified building, this website might be of interest to you. http://t.co/SWEUxBCP",
        "If you live in a stratified building, this website might be of interest to you. http://t.co/SWEUxBCP",
        "On the topic of Strata Talk. If you live in a stratified building, this website",
        "live in a stratified building, this website might be of interest",
        "Happy birthday to you now. On the topic of Strata Talk. If you live in a stratified building, this website might be of interest to you. http://t.co/SWEUxBCP",
        "On the topic of Strata Talk. If you live in a stratified building, this website might be of interest to you. http://t.co/SWEUxBCP Happy birthday to you now.",
        "On tsdfadhe topic of Stasdfadsrata Talk. Ifasdf you live in a stratifdfgsdsded building, this website might be of interest to you. http://t.co/SWEUxBCP",
        "On tsdfadhe topic of Stasdfadsrata Talk. Ifasdf you live in a stratifdfgsdsded buisdfasdlding, this weasdfasdbfsite might be oasdffasdfasdt.co/SWEUxBCP",
        "Social media index for businesses and industries. Also, a couple of good info design ideas bit.ly/r5tfqg",
        "lk. If you live in a stratified building, this website might be of interest to you. http://t.co/SWEUxBCP",
    ]
    for shift in range(5, 40):
        scorer = SS2Similar(shift)
        for i, left in enumerate(samples):
            # Start the inner walk at i so each unordered pair is scored once.
            for j, right in enumerate(samples[i:], i):
                print(i, j, shift, scorer.similarity(left, right))