@@ -62,22 +62,23 @@ def _edit_cost(op: str, qc: Optional[str], rc: Optional[str]) -> float:
6262 return 1.0
6363
6464
65- def _costs_similarity (costs : List [float ], max_cost_bias : float = 1.0 ) -> float :
66- """Calculate a similarity score based on a list of costs."""
67- if len (costs ) == 0 :
68- return 0.0
69- # max_cost defines how many edits we allow for a given length.
65+ def _max_allowed_cost (length : int , max_cost_bias : float = 1.0 ) -> float :
7066 # We use a log here because for very long names, we don't want an anything goes
7167 # policy for very long name strings (~hundreds of characters).
7268 # The log-base is a bit of a magic number. We adjusted it so that for
7369 # len 8 it allows ~2 edits. That seems reasonable, but is also entirely arbitrary.
7470 # We use log(x-2) to disable fuzzy-matching completely for very short
7571 # names (often Chinese names in practice).
76- max_cost = math .log (max (len (costs ) - 2 , 1 ), 2.35 ) * max_cost_bias
72+ return math .log (max (length - 2 , 1 ), 2.35 ) * max_cost_bias
73+
74+ def _costs_similarity (costs : List [float ], max_cost : float | None ) -> float :
75+ """Calculate a similarity score based on a list of costs."""
76+ if len (costs ) == 0 :
77+ return 0.0
7778 total_cost = sum (costs )
7879 if total_cost == 0 :
7980 return 1.0
80- if total_cost > max_cost :
81+ if max_cost is not None and total_cost > max_cost :
8182 return 0.0
8283 # Normalize the score to be between 0 and 1
8384 return 1 - (total_cost / len (costs ))
@@ -160,9 +161,19 @@ def weighted_edit_similarity(
160161
161162 qcosts = unroll (costs .get (p , [1.0 ]) for p in match .qps )
162163 rcosts = unroll (costs .get (p , [1.0 ]) for p in match .rps )
163- match .score = _costs_similarity (qcosts , max_cost_bias = bias ) * _costs_similarity (
164- rcosts , max_cost_bias = bias
165- )
164+
165+ q_max_cost = _max_allowed_cost (len (qcosts ))
166+ r_max_cost = _max_allowed_cost (len (rcosts ))
167+ # If we've fallen below the threshold for fuzzy matching, but one is a prefix of the other,
168+ # allow a fuzzy match anyway.
169+ if q_max_cost == 0 or r_max_cost == 0 :
170+ if match .qstr .startswith (match .rstr ) or match .rstr .startswith (match .qstr ):
171+ q_max_cost = None
172+ r_max_cost = None
173+ # Downscore because this whole thing is a bit sketchy
174+ match .weight = 0.7
175+
176+ match .score = _costs_similarity (qcosts , max_cost = q_max_cost ) * _costs_similarity (rcosts , max_cost = r_max_cost )
166177
167178 # Non-matched query parts: this penalizes scenarios where name parts in the query are
168179 # not matched to any name part in the result. Increasing this penalty will require queries
0 commit comments