Skip to content

Commit 624fe28

Browse files
committed
Add a limit parameter to the {Weighted,}Levenshtein distance.
Add a limit parameter to the distance methods of Levenshtein and WeightedLevenshtein. The calculation exits early once the limit is reached, so a caller that only cares about strings within a small distance of each other does not pay for the full computation when the strings turn out to be very different.
1 parent 495656d commit 624fe28

File tree

4 files changed

+69
-3
lines changed

4 files changed

+69
-3
lines changed

src/main/java/info/debatty/java/stringsimilarity/Levenshtein.java

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,13 @@
1313
@Immutable
1414
public class Levenshtein implements MetricStringDistance {
1515

16+
/**
17+
* Equivalent to distance(s1, s2, Integer.MAX_VALUE).
18+
*/
19+
public final double distance(final String s1, final String s2) {
20+
return distance(s1, s2, Integer.MAX_VALUE);
21+
}
22+
1623
/**
1724
* The Levenshtein distance, or edit distance, between two words is the
1825
* minimum number of single-character edits (insertions, deletions or
@@ -35,10 +42,16 @@ public class Levenshtein implements MetricStringDistance {
3542
*
3643
* @param s1 The first string to compare.
3744
* @param s2 The second string to compare.
45+
* @param limit The maximum result to compute before stopping. This
46+
* means that the calculation can terminate early if you
47+
* only care about strings with a certain similarity.
48+
* Set this to Integer.MAX_VALUE if you want to run the
49+
* calculation to completion in every case.
3850
* @return The computed Levenshtein distance, or limit if the calculation stopped early because the limit was reached.
3951
* @throws NullPointerException if s1 or s2 is null.
4052
*/
41-
public final double distance(final String s1, final String s2) {
53+
public final double distance(final String s1, final String s2,
54+
final int limit) {
4255
if (s1 == null) {
4356
throw new NullPointerException("s1 must not be null");
4457
}
@@ -77,6 +90,8 @@ public final double distance(final String s1, final String s2) {
7790
// edit distance is delete (i+1) chars from s to match empty t
7891
v1[0] = i + 1;
7992

93+
int minv1 = v1[0];
94+
8095
// use formula to fill in the rest of the row
8196
for (int j = 0; j < s2.length(); j++) {
8297
int cost = 1;
@@ -88,6 +103,12 @@ public final double distance(final String s1, final String s2) {
88103
Math.min(
89104
v0[j + 1] + 1, // Cost of remove
90105
v0[j] + cost)); // Cost of substitution
106+
107+
minv1 = Math.min(minv1, v1[j + 1]);
108+
}
109+
110+
if (minv1 >= limit) {
111+
return limit;
91112
}
92113

93114
// copy v1 (current row) to v0 (previous row) for next iteration

src/main/java/info/debatty/java/stringsimilarity/WeightedLevenshtein.java

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,14 +59,27 @@ public WeightedLevenshtein(final CharacterSubstitutionInterface charsub,
5959
this.charchange = charchange;
6060
}
6161

62+
/**
63+
* Equivalent to distance(s1, s2, Double.MAX_VALUE).
64+
*/
65+
public final double distance(final String s1, final String s2) {
66+
return distance(s1, s2, Double.MAX_VALUE);
67+
}
68+
6269
/**
6370
* Compute Levenshtein distance using provided weights for substitution.
6471
* @param s1 The first string to compare.
6572
* @param s2 The second string to compare.
73+
* @param limit The maximum result to compute before stopping. This
74+
* means that the calculation can terminate early if you
75+
* only care about strings with a certain similarity.
76+
* Set this to Double.MAX_VALUE if you want to run the
77+
* calculation to completion in every case.
6678
* @return The computed weighted Levenshtein distance, or limit if the calculation stopped early because the limit was reached.
6779
* @throws NullPointerException if s1 or s2 is null.
6880
*/
69-
public final double distance(final String s1, final String s2) {
81+
public final double distance(final String s1, final String s2,
82+
final double limit) {
7083
if (s1 == null) {
7184
throw new NullPointerException("s1 must not be null");
7285
}
@@ -87,7 +100,7 @@ public final double distance(final String s1, final String s2) {
87100
return s1.length();
88101
}
89102

90-
// create two work vectors of integer distances
103+
// create two work vectors of floating point (i.e. weighted) distances
91104
double[] v0 = new double[s2.length() + 1];
92105
double[] v1 = new double[s2.length() + 1];
93106
double[] vtemp;
@@ -110,6 +123,8 @@ public final double distance(final String s1, final String s2) {
110123
// to match empty t.
111124
v1[0] = v0[0] + deletion_cost;
112125

126+
double minv1 = v1[0];
127+
113128
// use formula to fill in the rest of the row
114129
for (int j = 0; j < s2.length(); j++) {
115130
char s2j = s2.charAt(j);
@@ -123,6 +138,12 @@ public final double distance(final String s1, final String s2) {
123138
Math.min(
124139
v0[j + 1] + deletion_cost, // Cost of deletion
125140
v0[j] + cost)); // Cost of substitution
141+
142+
minv1 = Math.min(minv1, v1[j + 1]);
143+
}
144+
145+
if (minv1 >= limit) {
146+
return limit;
126147
}
127148

128149
// copy v1 (current row) to v0 (previous row) for next iteration

src/test/java/info/debatty/java/stringsimilarity/LevenshteinTest.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,11 @@ public final void testDistance() {
4545
assertEquals(2.0, instance.distance("My string", "M string2"), 0.0);
4646
assertEquals(1.0, instance.distance("My string", "My $tring"), 0.0);
4747

48+
// With limits.
49+
assertEquals(2.0, instance.distance("My string", "M string2", 4), 0.0);
50+
assertEquals(2.0, instance.distance("My string", "M string2", 2), 0.0);
51+
assertEquals(1.0, instance.distance("My string", "M string2", 1), 0.0);
52+
4853
NullEmptyTests.testDistance(instance);
4954
}
5055
}

src/test/java/info/debatty/java/stringsimilarity/WeightedLevenshteinTest.java

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,15 @@ public double cost(char c1, char c2) {
3131
assertEquals(1.0, instance.distance("Strng", "String"), 0.1);
3232
assertEquals(1.0, instance.distance("String", "Strng"), 0.1);
3333

34+
// With limits.
35+
assertEquals(0.0, instance.distance("String1", "String1", Double.MAX_VALUE), 0.1);
36+
assertEquals(0.0, instance.distance("String1", "String1", 2.0), 0.1);
37+
assertEquals(1.5, instance.distance("String1", "Srring2", Double.MAX_VALUE), 0.1);
38+
assertEquals(1.5, instance.distance("String1", "Srring2", 2.0), 0.1);
39+
assertEquals(1.5, instance.distance("String1", "Srring2", 1.5), 0.1);
40+
assertEquals(1.0, instance.distance("String1", "Srring2", 1.0), 0.1);
41+
assertEquals(4.0, instance.distance("String1", "Potato", 4.0), 0.1);
42+
3443
NullEmptyTests.testDistance(instance);
3544
}
3645

@@ -75,6 +84,16 @@ public double insertionCost(char c) {
7584
assertEquals(1.0, instance.distance("Strig", "String"), 0.1);
7685
assertEquals(1.0, instance.distance("String", "Strig"), 0.1);
7786

87+
// Same as above with limits.
88+
assertEquals(0.0, instance.distance("String1", "String1", Double.MAX_VALUE), 0.1);
89+
assertEquals(0.0, instance.distance("String1", "String1", 2.0), 0.1);
90+
assertEquals(0.5, instance.distance("String1", "Srring1", Double.MAX_VALUE), 0.1);
91+
assertEquals(0.5, instance.distance("String1", "Srring1", 2.0), 0.1);
92+
assertEquals(1.5, instance.distance("String1", "Srring2", 2.0), 0.1);
93+
assertEquals(1.5, instance.distance("String1", "Srring2", 1.5), 0.1);
94+
assertEquals(1.0, instance.distance("String1", "Srring2", 1.0), 0.1);
95+
assertEquals(4.0, instance.distance("String1", "Potato", 4.0), 0.1);
96+
7897
NullEmptyTests.testDistance(instance);
7998
}
8099
}

0 commit comments

Comments
 (0)