Skip to content

Commit 2a345e0

Browse files
authored
Merge pull request #48 from NationalBI/levenshtein-limits
Add a limit parameter to the {Weighted,}Levenshtein distance.
2 parents 495656d + 624fe28 commit 2a345e0

File tree

4 files changed

+69
-3
lines changed

4 files changed

+69
-3
lines changed

src/main/java/info/debatty/java/stringsimilarity/Levenshtein.java

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,13 @@
1313
@Immutable
1414
public class Levenshtein implements MetricStringDistance {
1515

16+
/**
17+
* Equivalent to distance(s1, s2, Integer.MAX_VALUE).
18+
*/
19+
public final double distance(final String s1, final String s2) {
20+
return distance(s1, s2, Integer.MAX_VALUE);
21+
}
22+
1623
/**
1724
* The Levenshtein distance, or edit distance, between two words is the
1825
* minimum number of single-character edits (insertions, deletions or
@@ -35,10 +42,16 @@ public class Levenshtein implements MetricStringDistance {
3542
*
3643
* @param s1 The first string to compare.
3744
* @param s2 The second string to compare.
45+
* @param limit The maximum result to compute before stopping. The
46+
* calculation terminates early and returns this value once
47+
* every entry in the current row has reached it, which
48+
* helps if you only care about strings below that distance.
49+
* Use Integer.MAX_VALUE to always run to completion.
3850
* @return The computed Levenshtein distance.
3951
* @throws NullPointerException if s1 or s2 is null.
4052
*/
41-
public final double distance(final String s1, final String s2) {
53+
public final double distance(final String s1, final String s2,
54+
final int limit) {
4255
if (s1 == null) {
4356
throw new NullPointerException("s1 must not be null");
4457
}
@@ -77,6 +90,8 @@ public final double distance(final String s1, final String s2) {
7790
// edit distance is delete (i+1) chars from s to match empty t
7891
v1[0] = i + 1;
7992

93+
int minv1 = v1[0];
94+
8095
// use formula to fill in the rest of the row
8196
for (int j = 0; j < s2.length(); j++) {
8297
int cost = 1;
@@ -88,6 +103,12 @@ public final double distance(final String s1, final String s2) {
88103
Math.min(
89104
v0[j + 1] + 1, // Cost of remove
90105
v0[j] + cost)); // Cost of substitution
106+
107+
minv1 = Math.min(minv1, v1[j + 1]);
108+
}
109+
110+
if (minv1 >= limit) {
111+
return limit;
91112
}
92113

93114
// copy v1 (current row) to v0 (previous row) for next iteration

src/main/java/info/debatty/java/stringsimilarity/WeightedLevenshtein.java

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,14 +59,27 @@ public WeightedLevenshtein(final CharacterSubstitutionInterface charsub,
5959
this.charchange = charchange;
6060
}
6161

62+
/**
63+
* Equivalent to distance(s1, s2, Double.MAX_VALUE).
64+
*/
65+
public final double distance(final String s1, final String s2) {
66+
return distance(s1, s2, Double.MAX_VALUE);
67+
}
68+
6269
/**
6370
* Compute Levenshtein distance using provided weights for substitution.
6471
* @param s1 The first string to compare.
6572
* @param s2 The second string to compare.
73+
* @param limit The maximum result to compute before stopping. The
74+
* calculation terminates early and returns this value once
75+
* every entry in the current row has reached it, which
76+
* helps if you only care about strings below that distance.
77+
* Use Double.MAX_VALUE to always run to completion.
6678
* @return The computed weighted Levenshtein distance.
6779
* @throws NullPointerException if s1 or s2 is null.
6880
*/
69-
public final double distance(final String s1, final String s2) {
81+
public final double distance(final String s1, final String s2,
82+
final double limit) {
7083
if (s1 == null) {
7184
throw new NullPointerException("s1 must not be null");
7285
}
@@ -87,7 +100,7 @@ public final double distance(final String s1, final String s2) {
87100
return s1.length();
88101
}
89102

90-
// create two work vectors of integer distances
103+
// create two work vectors of floating point (i.e. weighted) distances
91104
double[] v0 = new double[s2.length() + 1];
92105
double[] v1 = new double[s2.length() + 1];
93106
double[] vtemp;
@@ -110,6 +123,8 @@ public final double distance(final String s1, final String s2) {
110123
// to match empty t.
111124
v1[0] = v0[0] + deletion_cost;
112125

126+
double minv1 = v1[0];
127+
113128
// use formula to fill in the rest of the row
114129
for (int j = 0; j < s2.length(); j++) {
115130
char s2j = s2.charAt(j);
@@ -123,6 +138,12 @@ public final double distance(final String s1, final String s2) {
123138
Math.min(
124139
v0[j + 1] + deletion_cost, // Cost of deletion
125140
v0[j] + cost)); // Cost of substitution
141+
142+
minv1 = Math.min(minv1, v1[j + 1]);
143+
}
144+
145+
if (minv1 >= limit) {
146+
return limit;
126147
}
127148

128149
// copy v1 (current row) to v0 (previous row) for next iteration

src/test/java/info/debatty/java/stringsimilarity/LevenshteinTest.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,11 @@ public final void testDistance() {
4545
assertEquals(2.0, instance.distance("My string", "M string2"), 0.0);
4646
assertEquals(1.0, instance.distance("My string", "My $tring"), 0.0);
4747

48+
// With limits.
49+
assertEquals(2.0, instance.distance("My string", "M string2", 4), 0.0);
50+
assertEquals(2.0, instance.distance("My string", "M string2", 2), 0.0);
51+
assertEquals(1.0, instance.distance("My string", "M string2", 1), 0.0);
52+
4853
NullEmptyTests.testDistance(instance);
4954
}
5055
}

src/test/java/info/debatty/java/stringsimilarity/WeightedLevenshteinTest.java

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,15 @@ public double cost(char c1, char c2) {
3131
assertEquals(1.0, instance.distance("Strng", "String"), 0.1);
3232
assertEquals(1.0, instance.distance("String", "Strng"), 0.1);
3333

34+
// With limits.
35+
assertEquals(0.0, instance.distance("String1", "String1", Double.MAX_VALUE), 0.1);
36+
assertEquals(0.0, instance.distance("String1", "String1", 2.0), 0.1);
37+
assertEquals(1.5, instance.distance("String1", "Srring2", Double.MAX_VALUE), 0.1);
38+
assertEquals(1.5, instance.distance("String1", "Srring2", 2.0), 0.1);
39+
assertEquals(1.5, instance.distance("String1", "Srring2", 1.5), 0.1);
40+
assertEquals(1.0, instance.distance("String1", "Srring2", 1.0), 0.1);
41+
assertEquals(4.0, instance.distance("String1", "Potato", 4.0), 0.1);
42+
3443
NullEmptyTests.testDistance(instance);
3544
}
3645

@@ -75,6 +84,16 @@ public double insertionCost(char c) {
7584
assertEquals(1.0, instance.distance("Strig", "String"), 0.1);
7685
assertEquals(1.0, instance.distance("String", "Strig"), 0.1);
7786

87+
// Same as above with limits.
88+
assertEquals(0.0, instance.distance("String1", "String1", Double.MAX_VALUE), 0.1);
89+
assertEquals(0.0, instance.distance("String1", "String1", 2.0), 0.1);
90+
assertEquals(0.5, instance.distance("String1", "Srring1", Double.MAX_VALUE), 0.1);
91+
assertEquals(0.5, instance.distance("String1", "Srring1", 2.0), 0.1);
92+
assertEquals(1.5, instance.distance("String1", "Srring2", 2.0), 0.1);
93+
assertEquals(1.5, instance.distance("String1", "Srring2", 1.5), 0.1);
94+
assertEquals(1.0, instance.distance("String1", "Srring2", 1.0), 0.1);
95+
assertEquals(4.0, instance.distance("String1", "Potato", 4.0), 0.1);
96+
7897
NullEmptyTests.testDistance(instance);
7998
}
8099
}

0 commit comments

Comments
 (0)