Skip to content

Commit

Permalink
create levenshtein similarity function for literal distance and as a …
Browse files Browse the repository at this point in the history
…percentage (inverted)
  • Loading branch information
MatthewErispe committed Oct 17, 2024
1 parent 9e5efbf commit fe8c001
Showing 1 changed file with 51 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.text.similarity.JaccardSimilarity;
import org.apache.commons.text.similarity.JaroWinklerSimilarity;
import org.apache.commons.text.similarity.LevenshteinDistance;
import org.apache.commons.text.similarity.SimilarityScore;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
Expand All @@ -29,6 +30,8 @@ public final class LinkerProbabilistic {
// Shared scorer instances, created once and handed out by getSimilarityFunction.
static final JaroSimilarity JARO_SIMILARITY = new JaroSimilarity();
static final ExactSimilarity EXACT_SIMILARITY = new ExactSimilarity();
static final SoundexSimilarity SOUNDEX_SIMILARITY = new SoundexSimilarity();
// Scores with the raw Levenshtein edit distance (number of edits, not normalised).
static final LevenshteinSimilarity LEVENSHTEIN_SIMILARITY = new LevenshteinSimilarity();
// Scores with 100 minus the edit distance taken as a percentage of the longer input's length.
static final LevenshteinSimilarityPercentage LEVENSHTEIN_SIMILARITY_PERCENTAGE = new LevenshteinSimilarityPercentage();
// NOTE(review): presumably positional indices into a per-field metrics array -
// their use is outside this hunk, confirm before relying on that reading.
private static final int METRIC_MIN = 0;
private static final int METRIC_MAX = 1;
private static final int METRIC_SCORE = 2;
Expand Down Expand Up @@ -79,7 +82,9 @@ public enum SimilarityFunctionName {
JARO_SIMILARITY,
JACCARD_SIMILARITY,
SOUNDEX_SIMILARITY,
EXACT_SIMILARITY
EXACT_SIMILARITY,
LEVENSHTEIN_SIMILARITY,
LEVENSHTEIN_SIMILARITY_PERCENTAGE
}

static SimilarityScore<Double> getSimilarityFunction(final SimilarityFunctionName func) {
Expand All @@ -92,6 +97,10 @@ static SimilarityScore<Double> getSimilarityFunction(final SimilarityFunctionNam
return JACCARD_SIMILARITY;
case SOUNDEX_SIMILARITY:
return SOUNDEX_SIMILARITY;
case LEVENSHTEIN_SIMILARITY:
return LEVENSHTEIN_SIMILARITY;
case LEVENSHTEIN_SIMILARITY_PERCENTAGE:
return LEVENSHTEIN_SIMILARITY_PERCENTAGE;
default:
return EXACT_SIMILARITY;
}
Expand Down Expand Up @@ -312,6 +321,47 @@ public Double apply(

}

/**
 * Scores two character sequences by how much of the longer one would NOT
 * need editing: {@code 100 - (editDistance / longerLength) * 100}.
 *
 * <p>Identical inputs score 100; inputs whose edit distance equals the length
 * of the longer one score 0. When either input is {@code null} or empty the
 * fixed value 0.5 is returned instead of a computed percentage.
 *
 * <p>NOTE(review): the 0.5 sentinel sits on a different scale from the 0-100
 * results produced otherwise - confirm callers expect that.
 */
static class LevenshteinSimilarityPercentage implements SimilarityScore<Double> {

    private final LevenshteinDistance levenshteinDistance = new LevenshteinDistance();

    /**
     * @param left  first sequence; may be {@code null} or empty
     * @param right second sequence; may be {@code null} or empty
     * @return 0.5 when either input is missing, otherwise the inverted
     *         edit-distance percentage described on the class
     */
    @Override
    public Double apply(
            final CharSequence left,
            final CharSequence right) {
        final boolean leftMissing = StringUtils.isEmpty(left);
        final boolean rightMissing = StringUtils.isEmpty(right);
        if (leftMissing || rightMissing) {
            return 0.5;
        }

        // Both inputs are non-empty here, so the divisor below is at least 1.
        final int longest = Math.max(left.length(), right.length());
        final double edits = levenshteinDistance.apply(left, right);

        // Share of the longer input that had to change, as a fraction.
        final double changedFraction = edits / longest;

        // Scale to a percentage and invert so that higher means more alike.
        return 100 - changedFraction * 100;
    }

}

/**
 * Scores two character sequences with their raw Levenshtein edit distance.
 *
 * <p>Despite the "similarity" name, the computed result is a distance: 0 means
 * the inputs are identical and larger values mean more edits are needed. When
 * either input is {@code null} or empty the fixed value 0.5 is returned and
 * no distance is computed.
 */
static class LevenshteinSimilarity implements SimilarityScore<Double> {

    private final LevenshteinDistance levenshteinDistance = new LevenshteinDistance();

    /**
     * @param left  first sequence; may be {@code null} or empty
     * @param right second sequence; may be {@code null} or empty
     * @return 0.5 when either input is missing, otherwise the edit distance
     *         between the two inputs as a {@code Double}
     */
    @Override
    public Double apply(
            final CharSequence left,
            final CharSequence right) {
        final boolean bothPresent = StringUtils.isNotEmpty(left) && StringUtils.isNotEmpty(right);
        if (!bothPresent) {
            return 0.5;
        }

        // The library reports the distance as an Integer; widen it to double.
        final int edits = levenshteinDistance.apply(left, right);
        return (double) edits;
    }

}

static class JaroSimilarity implements SimilarityScore<Double> {

@Override
Expand Down

0 comments on commit fe8c001

Please sign in to comment.