Skip to content

Commit f9dd097

Browse files
authored
Merge pull request #27 from adrg/metrics-improvements
Fix support for multibyte characters
2 parents 3a48a17 + f86c21b commit f9dd097

File tree

9 files changed

+66
-28
lines changed

9 files changed

+66
-28
lines changed

Diff for: .github/workflows/ci.yml

+2 -2
Original file line number | Diff line number | Diff line change
@@ -10,15 +10,15 @@ jobs:
1010
runs-on: ubuntu-latest
1111
strategy:
1212
matrix:
13-
go: ['1.16']
13+
go: ['1.17']
1414
steps:
1515
- name: Setup
1616
uses: actions/setup-go@v4
1717
with:
1818
go-version: ${{ matrix.go }}
1919

2020
- name: Checkout
21-
uses: actions/checkout@v3
21+
uses: actions/checkout@v4
2222

2323
- name: Dependencies
2424
run: |

Diff for: .github/workflows/codeql-analysis.yml

+1 -1
Original file line number | Diff line number | Diff line change
@@ -24,7 +24,7 @@ jobs:
2424

2525
steps:
2626
- name: Checkout repository
27-
uses: actions/checkout@v3
27+
uses: actions/checkout@v4
2828

2929
- name: Initialize CodeQL
3030
uses: github/codeql-action/init@v2

Diff for: go.mod

+1 -1
Original file line number | Diff line number | Diff line change
@@ -2,4 +2,4 @@ module github.com/adrg/strutil
22

33
go 1.14
44

5-
require github.com/stretchr/testify v1.8.2
5+
require github.com/stretchr/testify v1.8.4

Diff for: go.sum

+2 -2
Original file line number | Diff line number | Diff line change
@@ -8,8 +8,8 @@ github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSS
88
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
99
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
1010
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
11-
github.com/stretchr/testify v1.8.2 h1:+h33VjcLVPDHtOdpUCuF+7gSuG3yGIftsP1YvFihtJ8=
12-
github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
11+
github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
12+
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
1313
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
1414
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
1515
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

Diff for: internal/stringutil/stringutil.go

+4 -6
Original file line number | Diff line number | Diff line change
@@ -1,17 +1,15 @@
11
package stringutil
22

3-
import "unicode/utf8"
4-
53
// CommonPrefix returns the common prefix of the specified strings. An empty
64
// string is returned if the parameters have no prefix in common.
75
func CommonPrefix(first, second string) string {
8-
if utf8.RuneCountInString(first) > utf8.RuneCountInString(second) {
9-
first, second = second, first
6+
fRunes, sRunes := []rune(first), []rune(second)
7+
if len(fRunes) > len(sRunes) {
8+
fRunes, sRunes = sRunes, fRunes
109
}
1110

1211
var commonLen int
13-
sRunes := []rune(second)
14-
for i, r := range first {
12+
for i, r := range fRunes {
1513
if r != sRunes[i] {
1614
break
1715
}

Diff for: internal/stringutil/stringutil_test.go

+8
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,14 @@ func TestCommonPrefix(t *testing.T) {
1717
{"a", stringutil.CommonPrefix("aab", "ab")},
1818
{"aa", stringutil.CommonPrefix("aab", "aaab")},
1919
{"aa", stringutil.CommonPrefix("aaab", "aab")},
20+
{"忧郁的乌龟", stringutil.CommonPrefix("忧郁的乌龟", "忧郁的乌龟")},
21+
{"忧郁的", stringutil.CommonPrefix("忧郁的", "忧郁的乌龟")},
22+
{"忧郁的", stringutil.CommonPrefix("忧郁的乌龟", "忧郁的")},
23+
{"", stringutil.CommonPrefix("忧郁的乌龟", "郁的乌龟")},
24+
{"", stringutil.CommonPrefix("郁的乌龟", "忧郁的乌龟")},
25+
{"\u2019", stringutil.CommonPrefix("\u2019a", "\u2019b")},
26+
{"a\u2019bc", stringutil.CommonPrefix("a\u2019bcd", "a\u2019bce")},
27+
{"abc", stringutil.CommonPrefix("abc\u2019d", "abc\u2020d")},
2028
})
2129
}
2230

Diff for: metrics/jaro.go

+10 -7
Original file line number | Diff line number | Diff line change
@@ -62,22 +62,25 @@ func (m *Jaro) Compare(a, b string) float64 {
6262
}
6363

6464
func matchingRunes(a, b string, limit int) []rune {
65-
common := []rune{}
66-
runesB := []rune(b)
67-
lenB := len(runesB)
68-
69-
for i, r := range a {
65+
var (
66+
runesA = []rune(a)
67+
runesB = []rune(b)
68+
runesCommon = []rune{}
69+
lenB = len(runesB)
70+
)
71+
72+
for i, r := range runesA {
7073
end := mathutil.Min(i+limit+1, lenB)
7174
for j := mathutil.Max(0, i-limit); j < end; j++ {
7275
if r == runesB[j] && runesB[j] != -1 {
73-
common = append(common, runesB[j])
76+
runesCommon = append(runesCommon, runesB[j])
7477
runesB[j] = -1
7578
break
7679
}
7780
}
7881
}
7982

80-
return common
83+
return runesCommon
8184
}
8285

8386
func transpositions(a, b []rune) int {

Diff for: metrics/levenshtein.go

+9 -9
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,6 @@ package metrics
22

33
import (
44
"strings"
5-
"unicode/utf8"
65

76
"github.com/adrg/strutil/internal/mathutil"
87
)
@@ -56,8 +55,15 @@ func (m *Levenshtein) Distance(a, b string) int {
5655
}
5756

5857
func (m *Levenshtein) distance(a, b string) (int, int) {
58+
// Lower terms if case insensitive comparison is specified.
59+
if !m.CaseSensitive {
60+
a = strings.ToLower(a)
61+
b = strings.ToLower(b)
62+
}
63+
runesA, runesB := []rune(a), []rune(b)
64+
5965
// Check if both terms are empty.
60-
lenA, lenB := utf8.RuneCountInString(a), utf8.RuneCountInString(b)
66+
lenA, lenB := len(runesA), len(runesB)
6167
if lenA == 0 && lenB == 0 {
6268
return 0, 0
6369
}
@@ -71,12 +77,6 @@ func (m *Levenshtein) distance(a, b string) (int, int) {
7177
return m.DeleteCost * lenA, maxLen
7278
}
7379

74-
// Lower terms if case insensitive comparison is specified.
75-
if !m.CaseSensitive {
76-
a = strings.ToLower(a)
77-
b = strings.ToLower(b)
78-
}
79-
8080
// Initialize cost slice.
8181
prevCol := make([]int, lenB+1)
8282
for i := 0; i <= lenB; i++ {
@@ -92,7 +92,7 @@ func (m *Levenshtein) distance(a, b string) (int, int) {
9292
insCost := col[j] + m.InsertCost
9393

9494
subCost := prevCol[j]
95-
if a[i] != b[j] {
95+
if runesA[i] != runesB[j] {
9696
subCost += m.ReplaceCost
9797
}
9898

Diff for: metrics/metrics_test.go

+29
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,9 @@ func TestHamming(t *testing.T) {
1717
require.Equal(t, 0, h.Distance("", ""))
1818
require.Equal(t, "0.75", sf(h.Compare("text", "test")))
1919
require.Equal(t, "0.50", sf(h.Compare("once", "one")))
20+
require.Equal(t, "1.00", sf(h.Compare("ab\u2019c", "ab\u2019c")))
21+
require.Equal(t, "0.75", sf(h.Compare("ab\u2019d", "ab\u2019c")))
22+
require.Equal(t, "0.75", sf(h.Compare("ab\u2018c", "ab\u2019c")))
2023
h.CaseSensitive = false
2124
require.Equal(t, "0.50", sf(h.Compare("one", "ONCE")))
2225
}
@@ -25,6 +28,9 @@ func TestJaccard(t *testing.T) {
2528
j := metrics.NewJaccard()
2629
require.Equal(t, "1.00", sf(j.Compare("", "")))
2730
require.Equal(t, "0.00", sf(j.Compare("a", "b")))
31+
require.Equal(t, "1.00", sf(j.Compare("ab\u2019c", "ab\u2019c")))
32+
require.Equal(t, "0.50", sf(j.Compare("ab\u2019d", "ab\u2019c")))
33+
require.Equal(t, "0.20", sf(j.Compare("ab\u2018c", "ab\u2019c")))
2834
require.Equal(t, "0.43", sf(j.Compare("night", "alright")))
2935
j.NgramSize = 0
3036
require.Equal(t, "0.43", sf(j.Compare("night", "alright")))
@@ -37,6 +43,9 @@ func TestJaro(t *testing.T) {
3743
j := metrics.NewJaro()
3844
require.Equal(t, "1.00", sf(j.Compare("", "")))
3945
require.Equal(t, "0.00", sf(j.Compare("test", "")))
46+
require.Equal(t, "1.00", sf(j.Compare("ab\u2019c", "ab\u2019c")))
47+
require.Equal(t, "0.83", sf(j.Compare("ab\u2019d", "ab\u2019c")))
48+
require.Equal(t, "0.83", sf(j.Compare("ab\u2018c", "ab\u2019c")))
4049
require.Equal(t, "0.00", sf(j.Compare("a", "b")))
4150
require.Equal(t, "0.78", sf(j.Compare("sort", "shirt")))
4251
require.Equal(t, "0.64", sf(j.Compare("sort", "report")))
@@ -48,6 +57,9 @@ func TestJaroWinkler(t *testing.T) {
4857
j := metrics.NewJaroWinkler()
4958
require.Equal(t, "1.00", sf(j.Compare("", "")))
5059
require.Equal(t, "0.00", sf(j.Compare("test", "")))
60+
require.Equal(t, "1.00", sf(j.Compare("ab\u2019c", "ab\u2019c")))
61+
require.Equal(t, "0.88", sf(j.Compare("ab\u2019d", "ab\u2019c")))
62+
require.Equal(t, "0.87", sf(j.Compare("ab\u2018c", "ab\u2019c")))
5163
require.Equal(t, "0.80", sf(j.Compare("sort", "shirt")))
5264
require.Equal(t, "0.94", sf(j.Compare("charm", "charmed")))
5365
j.CaseSensitive = false
@@ -59,11 +71,19 @@ func TestLevenshtein(t *testing.T) {
5971
require.Equal(t, 0, l.Distance("", ""))
6072
require.Equal(t, 4, l.Distance("test", ""))
6173
require.Equal(t, 4, l.Distance("", "test"))
74+
require.Equal(t, 0, l.Distance("ab\u2019c", "ab\u2019c"))
75+
require.Equal(t, 1, l.Distance("ab\u2019d", "ab\u2019c"))
76+
require.Equal(t, 1, l.Distance("ab\u2018c", "ab\u2019c"))
6277
require.Equal(t, "0.40", sf(l.Compare("book", "brick")))
78+
require.Equal(t, "0.75", sf(l.Compare("ab\u2019d", "ab\u2019c")))
79+
require.Equal(t, "0.75", sf(l.Compare("ab\u2018c", "ab\u2019c")))
6380
l.CaseSensitive = false
6481
require.Equal(t, "0.80", sf(l.Compare("hello", "jello")))
6582
l.ReplaceCost = 2
6683
require.Equal(t, "0.60", sf(l.Compare("hello", "JELLO")))
84+
require.Equal(t, "1.00", sf(l.Compare("ab\u2019c", "ab\u2019c")))
85+
require.Equal(t, "0.50", sf(l.Compare("ab\u2019d", "ab\u2019c")))
86+
require.Equal(t, "0.50", sf(l.Compare("ab\u2018c", "ab\u2019c")))
6787
}
6888

6989
func TestOperlapCoefficient(t *testing.T) {
@@ -72,6 +92,9 @@ func TestOperlapCoefficient(t *testing.T) {
7292
require.Equal(t, "0.75", sf(o.Compare("night", "alright")))
7393
require.Equal(t, "0.00", sf(o.Compare("aa", "")))
7494
require.Equal(t, "0.00", sf(o.Compare("bb", "")))
95+
require.Equal(t, "1.00", sf(o.Compare("ab\u2019c", "ab\u2019c")))
96+
require.Equal(t, "0.67", sf(o.Compare("ab\u2019d", "ab\u2019c")))
97+
require.Equal(t, "0.33", sf(o.Compare("ab\u2018c", "ab\u2019c")))
7598
o.NgramSize = 0
7699
require.Equal(t, "0.75", sf(o.Compare("night", "alright")))
77100
require.Equal(t, "1.00", sf(o.Compare("aa", "aaaa")))
@@ -87,6 +110,9 @@ func TestSmithWatermanGotoh(t *testing.T) {
87110
require.Equal(t, "0.00", sf(s.Compare("test", "")))
88111
require.Equal(t, "0.00", sf(s.Compare("", "test")))
89112
require.Equal(t, "0.88", sf(s.Compare("a pink kitten", "a kitten")))
113+
require.Equal(t, "1.00", sf(s.Compare("ab\u2019c", "ab\u2019c")))
114+
require.Equal(t, "0.75", sf(s.Compare("ab\u2019d", "ab\u2019c")))
115+
require.Equal(t, "0.50", sf(s.Compare("ab\u2018c", "ab\u2019c")))
90116
s.Substitution = nil
91117
require.Equal(t, "0.88", sf(s.Compare("a pink kitten", "a kitten")))
92118
s.CaseSensitive = false
@@ -103,6 +129,9 @@ func TestSorensenDice(t *testing.T) {
103129
require.Equal(t, "1.00", sf(s.Compare("", "")))
104130
require.Equal(t, "0.00", sf(s.Compare("a", "b")))
105131
require.Equal(t, "0.60", sf(s.Compare("night", "alright")))
132+
require.Equal(t, "1.00", sf(s.Compare("ab\u2019c", "ab\u2019c")))
133+
require.Equal(t, "0.67", sf(s.Compare("ab\u2019d", "ab\u2019c")))
134+
require.Equal(t, "0.33", sf(s.Compare("ab\u2018c", "ab\u2019c")))
106135
s.NgramSize = 0
107136
require.Equal(t, "0.60", sf(s.Compare("night", "alright")))
108137
s.CaseSensitive = false

0 commit comments

Comments
 (0)