Skip to content

Commit 792e3dd

Browse files
chore: replace editdistance with rapidfuzz for Levenshtein distance
1 parent f34c34a commit 792e3dd

File tree

2 files changed

+3
-21
lines changed

2 files changed

+3
-21
lines changed

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,4 @@ amazon-textract-caller>=0.2.4,<1
22
Pillow
33
tabulate>=0.9,<0.10
44
XlsxWriter>=3.0,<4
5-
editdistance>=0.6.2,<0.9
5+
rapidfuzz>=3.9.6

textractor/utils/search_utils.py

Lines changed: 2 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,7 @@
77
# The latter has numpy as dependency.
88
pass
99

10-
import math
11-
import editdistance
10+
from rapidfuzz.distance import Levenshtein
1211
from textractor.data.constants import SimilarityMetric
1312
from textractor.exceptions import MissingDependencyException
1413

@@ -59,7 +58,7 @@ def get_word_similarity(
5958
cls.util = util
6059

6160
if similarity_metric == SimilarityMetric.LEVENSHTEIN:
62-
return normalized_edit_distance(word_1.lower(), word_2.lower())
61+
return Levenshtein.normalized_similarity(word_1.lower(), word_2.lower())
6362
elif similarity_metric == SimilarityMetric.EUCLIDEAN:
6463
ref_word_emb = cls.model.encode([word_1])
6564
word_emb = cls.model.encode([word_2])
@@ -110,20 +109,3 @@ def get_metadata_attr_name(cell_atr):
110109
return cell_map[cell_atr]
111110
except:
112111
return ""
113-
114-
115-
def normalized_edit_distance(s1: str, s2: str):
116-
"""
117-
Returns the normalized edit distance
118-
119-
:param s1: First string
120-
:type s1: str
121-
:param s2: Second string
122-
:type s2: str
123-
"""
124-
125-
dist = editdistance.eval(s1, s2)
126-
max_length = max(len(s1), len(s2))
127-
if max_length - dist == 0:
128-
return 0.0
129-
return (max_length - dist) / max_length

0 commit comments

Comments
 (0)