Skip to content

Commit 4527d8b

Browse files
committed
logic_v2: Allow fuzzy if one is a prefix of the other
1 parent 97ba4f0 commit 4527d8b

File tree

1 file changed

+21
-10
lines changed

1 file changed

+21
-10
lines changed

nomenklatura/matching/logic_v2/names/distance.py

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -62,22 +62,23 @@ def _edit_cost(op: str, qc: Optional[str], rc: Optional[str]) -> float:
6262
return 1.0
6363

6464

65-
def _costs_similarity(costs: List[float], max_cost_bias: float = 1.0) -> float:
66-
"""Calculate a similarity score based on a list of costs."""
67-
if len(costs) == 0:
68-
return 0.0
69-
# max_cost defines how many edits we allow for a given length.
65+
def _max_allowed_cost(length: int, max_cost_bias: float = 1.0) -> float:
7066
# We use a log here because we don't want an anything-goes
7167
# policy for very long name strings (~hundreds of characters).
7268
# The log-base is a bit of a magic number. We adjusted it so that for
7369
# len 8 it allows ~2 edits. That seems reasonable, but is also entirely arbitrary.
7470
# We use log(x-2) to disable fuzzy-matching completely for very short
7571
# names (often Chinese names in practice).
76-
max_cost = math.log(max(len(costs) - 2, 1), 2.35) * max_cost_bias
72+
return math.log(max(length - 2, 1), 2.35) * max_cost_bias
73+
74+
def _costs_similarity(costs: List[float], max_cost: float | None) -> float:
75+
"""Calculate a similarity score based on a list of costs."""
76+
if len(costs) == 0:
77+
return 0.0
7778
total_cost = sum(costs)
7879
if total_cost == 0:
7980
return 1.0
80-
if total_cost > max_cost:
81+
if max_cost is not None and total_cost > max_cost:
8182
return 0.0
8283
# Normalize the score to be between 0 and 1
8384
return 1 - (total_cost / len(costs))
@@ -160,9 +161,19 @@ def weighted_edit_similarity(
160161

161162
qcosts = unroll(costs.get(p, [1.0]) for p in match.qps)
162163
rcosts = unroll(costs.get(p, [1.0]) for p in match.rps)
163-
match.score = _costs_similarity(qcosts, max_cost_bias=bias) * _costs_similarity(
164-
rcosts, max_cost_bias=bias
165-
)
164+
165+
q_max_cost = _max_allowed_cost(len(qcosts))
166+
r_max_cost = _max_allowed_cost(len(rcosts))
167+
# If we've fallen below the threshold for fuzzy matching, but one is a prefix of the other,
168+
# allow a fuzzy match anyway.
169+
if q_max_cost == 0 or r_max_cost == 0:
170+
if match.qstr.startswith(match.rstr) or match.rstr.startswith(match.qstr):
171+
q_max_cost = None
172+
r_max_cost = None
173+
# Downscore because this whole thing is a bit sketchy
174+
match.weight = 0.7
175+
176+
match.score = _costs_similarity(qcosts, max_cost=q_max_cost) * _costs_similarity(rcosts, max_cost=r_max_cost)
166177

167178
# Non-matched query parts: this penalizes scenarios where name parts in the query are
168179
# not matched to any name part in the result. Increasing this penalty will require queries

0 commit comments

Comments
 (0)