-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstring_clusterer.go
188 lines (157 loc) · 4.62 KB
/
string_clusterer.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
package string_clusterer
import (
"github.com/adrg/strutil/metrics"
)
type btreeNode struct {
left, right *btreeNode
values []string
}
func traverse(node *btreeNode, groups [][]string) [][]string {
if node == nil {
return groups
}
groups = append(groups, node.values)
groups = traverse(node.left, groups)
groups = traverse(node.right, groups)
return groups
}
type Clusterer struct {
similarityMetric SimilarityMetric
threshold float64
iterations uint64
}
func NewClusterer(options ...func(*Clusterer)) *Clusterer {
clusterer := &Clusterer{
similarityMetric: NewJaroWinkler(false),
threshold: 0.9,
iterations: 1,
}
for _, option := range options {
option(clusterer)
}
return clusterer
}
func WithSimilarityMetric(similarityMetric SimilarityMetric) func(*Clusterer) {
return func(s *Clusterer) {
s.similarityMetric = similarityMetric
}
}
func WithThreshold(threshold float64) func(*Clusterer) {
return func(s *Clusterer) {
s.threshold = threshold
}
}
func WithIterations(iterations uint64) func(*Clusterer) {
return func(s *Clusterer) {
s.iterations = iterations
}
}
// SimilarityMetric represents a metric for measuring the similarity between strings.
type SimilarityMetric interface {
Compare(a, b string) float64
}
// Cluster groups a slice of strings according to a similarity metric and a threshold.
func (clusterer Clusterer) Cluster(inputStrings []string) [][]string {
if len(inputStrings) == 0 {
return [][]string{}
}
result := make([][]string, len(inputStrings))
for i, v := range inputStrings {
result[i] = []string{v}
}
for range clusterer.iterations {
bTree := &btreeNode{nil, nil, result[0]}
for _, cluster := range result[1:] {
inputHead := cluster[0]
node := bTree
for {
nodeHead := node.values[0]
similarity := clusterer.similarityMetric.Compare(inputHead, nodeHead)
if similarity >= clusterer.threshold {
node.values = append(node.values, cluster...)
break
}
if node.left == nil {
node.left = &btreeNode{nil, nil, cluster}
break
}
leftHead := node.left.values[0]
leftSimilarity := clusterer.similarityMetric.Compare(inputHead, leftHead)
if node.right == nil {
if leftSimilarity >= clusterer.threshold {
node.left.values = append(node.left.values, cluster...)
break
} else {
node.right = &btreeNode{nil, nil, cluster}
break
}
}
rightHead := node.right.values[0]
rightSimilarity := clusterer.similarityMetric.Compare(inputHead, rightHead)
if leftSimilarity >= rightSimilarity {
if leftSimilarity >= clusterer.threshold {
node.left.values = append(node.left.values, cluster...)
break
}
node = node.left
} else {
if rightSimilarity >= clusterer.threshold {
node.right.values = append(node.right.values, cluster...)
break
}
node = node.right
}
}
}
result = traverse(bTree, make([][]string, 0))
}
return result
}
// NewHamming returns a new Hamming similarity metric.
func NewHamming(caseSensitive bool) SimilarityMetric {
metric := metrics.NewHamming()
metric.CaseSensitive = caseSensitive
return metric
}
// NewJaccard returns a new Jaccard similarity metric.
func NewJaccard(caseSensitive bool) SimilarityMetric {
metric := metrics.NewJaccard()
metric.CaseSensitive = caseSensitive
return metric
}
// NewJaro returns a new Jaro similarity metric.
func NewJaro(caseSensitive bool) SimilarityMetric {
metric := metrics.NewJaro()
metric.CaseSensitive = caseSensitive
return metric
}
// NewJaroWinkler returns a new JaroWinkler similarity metric.
func NewJaroWinkler(caseSensitive bool) SimilarityMetric {
metric := metrics.NewJaroWinkler()
metric.CaseSensitive = caseSensitive
return metric
}
// NewLevenshtein returns a new Levenshtein similarity metric.
func NewLevenshtein(caseSensitive bool) SimilarityMetric {
metric := metrics.NewLevenshtein()
metric.CaseSensitive = caseSensitive
return metric
}
// NewOverlapCoefficient returns a new OverlapCoefficient similarity metric.
func NewOverlapCoefficient(caseSensitive bool) SimilarityMetric {
metric := metrics.NewOverlapCoefficient()
metric.CaseSensitive = caseSensitive
return metric
}
// NewSmithWatermanGotoh returns a new SmithWatermanGotoh similarity metric.
func NewSmithWatermanGotoh(caseSensitive bool) SimilarityMetric {
metric := metrics.NewSmithWatermanGotoh()
metric.CaseSensitive = caseSensitive
return metric
}
// NewSorensenDice returns a new SorensenDice similarity metric.
func NewSorensenDice(caseSensitive bool) SimilarityMetric {
metric := metrics.NewSorensenDice()
metric.CaseSensitive = caseSensitive
return metric
}