Skip to content

Commit fe8c001

Browse files
committed
Create Levenshtein similarity functions: one returning the literal edit distance and one returning it as an inverted percentage
1 parent 9e5efbf commit fe8c001

File tree

1 file changed

+51
-1
lines changed

1 file changed

+51
-1
lines changed

JeMPI_Apps/JeMPI_Linker/src/main/java/org/jembi/jempi/linker/backend/LinkerProbabilistic.java

Lines changed: 51 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import org.apache.commons.lang3.StringUtils;
66
import org.apache.commons.text.similarity.JaccardSimilarity;
77
import org.apache.commons.text.similarity.JaroWinklerSimilarity;
8+
import org.apache.commons.text.similarity.LevenshteinDistance;
89
import org.apache.commons.text.similarity.SimilarityScore;
910
import org.apache.logging.log4j.LogManager;
1011
import org.apache.logging.log4j.Logger;
@@ -29,6 +30,8 @@ public final class LinkerProbabilistic {
2930
static final JaroSimilarity JARO_SIMILARITY = new JaroSimilarity();
3031
static final ExactSimilarity EXACT_SIMILARITY = new ExactSimilarity();
3132
static final SoundexSimilarity SOUNDEX_SIMILARITY = new SoundexSimilarity();
33+
static final LevenshteinSimilarity LEVENSHTEIN_SIMILARITY = new LevenshteinSimilarity();
34+
static final LevenshteinSimilarityPercentage LEVENSHTEIN_SIMILARITY_PERCENTAGE = new LevenshteinSimilarityPercentage();
3235
private static final int METRIC_MIN = 0;
3336
private static final int METRIC_MAX = 1;
3437
private static final int METRIC_SCORE = 2;
@@ -79,7 +82,9 @@ public enum SimilarityFunctionName {
7982
JARO_SIMILARITY,
8083
JACCARD_SIMILARITY,
8184
SOUNDEX_SIMILARITY,
82-
EXACT_SIMILARITY
85+
EXACT_SIMILARITY,
86+
LEVENSHTEIN_SIMILARITY,
87+
LEVENSHTEIN_SIMILARITY_PERCENTAGE
8388
}
8489

8590
static SimilarityScore<Double> getSimilarityFunction(final SimilarityFunctionName func) {
@@ -92,6 +97,10 @@ static SimilarityScore<Double> getSimilarityFunction(final SimilarityFunctionNam
9297
return JACCARD_SIMILARITY;
9398
case SOUNDEX_SIMILARITY:
9499
return SOUNDEX_SIMILARITY;
100+
case LEVENSHTEIN_SIMILARITY:
101+
return LEVENSHTEIN_SIMILARITY;
102+
case LEVENSHTEIN_SIMILARITY_PERCENTAGE:
103+
return LEVENSHTEIN_SIMILARITY_PERCENTAGE;
95104
default:
96105
return EXACT_SIMILARITY;
97106
}
@@ -312,6 +321,47 @@ public Double apply(
312321

313322
}
314323

324+
static class LevenshteinSimilarityPercentage implements SimilarityScore<Double> {
325+
326+
private final LevenshteinDistance levenshteinDistance = new LevenshteinDistance();
327+
328+
@Override
329+
public Double apply(
330+
final CharSequence left,
331+
final CharSequence right) {
332+
if (StringUtils.isEmpty(left) || StringUtils.isEmpty(right)) {
333+
return 0.5;
334+
}
335+
336+
int maxLength = Math.max(left.length(), right.length());
337+
double levenshteinDistanceValue = levenshteinDistance.apply(left, right);
338+
339+
// Invert the percentage value
340+
double percentage = (levenshteinDistanceValue / maxLength) * 100;
341+
double invertedPercentage = 100 - percentage;
342+
343+
return invertedPercentage;
344+
}
345+
346+
}
347+
348+
static class LevenshteinSimilarity implements SimilarityScore<Double> {
349+
350+
private final LevenshteinDistance levenshteinDistance = new LevenshteinDistance();
351+
352+
@Override
353+
public Double apply(
354+
final CharSequence left,
355+
final CharSequence right) {
356+
if (StringUtils.isEmpty(left) || StringUtils.isEmpty(right)) {
357+
return 0.5;
358+
}
359+
360+
return Double.valueOf(levenshteinDistance.apply(left, right));
361+
}
362+
363+
}
364+
315365
static class JaroSimilarity implements SimilarityScore<Double> {
316366

317367
@Override

0 commit comments

Comments
 (0)