5
5
import org .apache .commons .lang3 .StringUtils ;
6
6
import org .apache .commons .text .similarity .JaccardSimilarity ;
7
7
import org .apache .commons .text .similarity .JaroWinklerSimilarity ;
8
+ import org .apache .commons .text .similarity .LevenshteinDistance ;
8
9
import org .apache .commons .text .similarity .SimilarityScore ;
9
10
import org .apache .logging .log4j .LogManager ;
10
11
import org .apache .logging .log4j .Logger ;
@@ -29,6 +30,8 @@ public final class LinkerProbabilistic {
29
30
// Shared scorer instances; getSimilarityFunction hands these out by enum name
// instead of creating a new scorer on every call.
static final JaroSimilarity JARO_SIMILARITY = new JaroSimilarity ();
30
31
static final ExactSimilarity EXACT_SIMILARITY = new ExactSimilarity ();
31
32
static final SoundexSimilarity SOUNDEX_SIMILARITY = new SoundexSimilarity ();
33
+ static final LevenshteinSimilarity LEVENSHTEIN_SIMILARITY = new LevenshteinSimilarity ();
34
+ static final LevenshteinSimilarityPercentage LEVENSHTEIN_SIMILARITY_PERCENTAGE = new LevenshteinSimilarityPercentage ();
32
35
// NOTE(review): the three METRIC_* values look like slot indices into a
// per-field metrics array; their use is outside this view - confirm.
private static final int METRIC_MIN = 0 ;
33
36
private static final int METRIC_MAX = 1 ;
34
37
private static final int METRIC_SCORE = 2 ;
@@ -79,7 +82,9 @@ public enum SimilarityFunctionName {
79
82
JARO_SIMILARITY ,
80
83
JACCARD_SIMILARITY ,
81
84
SOUNDEX_SIMILARITY ,
82
- EXACT_SIMILARITY
85
+ EXACT_SIMILARITY ,
86
+ LEVENSHTEIN_SIMILARITY ,
87
+ LEVENSHTEIN_SIMILARITY_PERCENTAGE
83
88
}
84
89
85
90
static SimilarityScore <Double > getSimilarityFunction (final SimilarityFunctionName func ) {
@@ -92,6 +97,10 @@ static SimilarityScore<Double> getSimilarityFunction(final SimilarityFunctionNam
92
97
return JACCARD_SIMILARITY ;
93
98
case SOUNDEX_SIMILARITY :
94
99
return SOUNDEX_SIMILARITY ;
100
+ case LEVENSHTEIN_SIMILARITY :
101
+ return LEVENSHTEIN_SIMILARITY ;
102
+ case LEVENSHTEIN_SIMILARITY_PERCENTAGE :
103
+ return LEVENSHTEIN_SIMILARITY_PERCENTAGE ;
95
104
default :
96
105
return EXACT_SIMILARITY ;
97
106
}
@@ -312,6 +321,47 @@ public Double apply(
312
321
313
322
}
314
323
324
+ static class LevenshteinSimilarityPercentage implements SimilarityScore <Double > {
325
+
326
+ private final LevenshteinDistance levenshteinDistance = new LevenshteinDistance ();
327
+
328
+ @ Override
329
+ public Double apply (
330
+ final CharSequence left ,
331
+ final CharSequence right ) {
332
+ if (StringUtils .isEmpty (left ) || StringUtils .isEmpty (right )) {
333
+ return 0.5 ;
334
+ }
335
+
336
+ int maxLength = Math .max (left .length (), right .length ());
337
+ double levenshteinDistanceValue = levenshteinDistance .apply (left , right );
338
+
339
+ // Invert the percentage value
340
+ double percentage = (levenshteinDistanceValue / maxLength ) * 100 ;
341
+ double invertedPercentage = 100 - percentage ;
342
+
343
+ return invertedPercentage ;
344
+ }
345
+
346
+ }
347
+
348
+ static class LevenshteinSimilarity implements SimilarityScore <Double > {
349
+
350
+ private final LevenshteinDistance levenshteinDistance = new LevenshteinDistance ();
351
+
352
+ @ Override
353
+ public Double apply (
354
+ final CharSequence left ,
355
+ final CharSequence right ) {
356
+ if (StringUtils .isEmpty (left ) || StringUtils .isEmpty (right )) {
357
+ return 0.5 ;
358
+ }
359
+
360
+ return Double .valueOf (levenshteinDistance .apply (left , right ));
361
+ }
362
+
363
+ }
364
+
315
365
static class JaroSimilarity implements SimilarityScore <Double > {
316
366
317
367
@ Override
0 commit comments