Skip to content

Commit 689105a

Browse files
committed
black
1 parent 3070fc8 commit 689105a

File tree

5 files changed

+129
-134
lines changed

5 files changed

+129
-134
lines changed

dedupe/__init__.py

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,15 @@
1-
#!/usr/bin/python
2-
# -*- coding: utf-8 -*-
3-
from pkgutil import extend_path
4-
5-
__path__ = extend_path(__path__, __name__)
6-
7-
from dedupe._init import * # noqa
1+
from dedupe.api import ( # noqa: F401
2+
Dedupe,
3+
Gazetteer,
4+
RecordLink,
5+
StaticDedupe,
6+
StaticGazetteer,
7+
StaticRecordLink,
8+
)
9+
from dedupe.convenience import ( # noqa: F401
10+
canonicalize,
11+
console_label,
12+
training_data_dedupe,
13+
training_data_link,
14+
)
15+
from dedupe.serializer import read_training, write_training # noqa: F401

dedupe/_init.py

Lines changed: 0 additions & 15 deletions
This file was deleted.

dedupe/branch_and_bound.py

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
import functools
2+
import warnings
3+
from typing import Any, Iterable, Mapping, Sequence
4+
5+
from ._typing import Cover
6+
from .predicates import Predicate
7+
8+
Partial = tuple[Predicate, ...]
9+
10+
11+
def _reachable(dupe_cover: Mapping[Any, frozenset[int]]) -> int:
    """Count the distinct items covered by any entry of ``dupe_cover``.

    Returns the size of the union of all cover sets in the mapping, or 0
    when the mapping is empty.
    """
    if not dupe_cover:
        return 0

    everything = frozenset.union(*dupe_cover.values())
    return len(everything)
13+
14+
15+
def _remove_dominated(coverage: Cover, dominator: Predicate) -> Cover:
    """Return a copy of ``coverage`` without entries dominated by ``dominator``.

    An entry is dropped when ``dominator.cover_count`` is less than or equal
    to the entry's ``cover_count`` and the dominator's cover is a superset
    of the entry's cover. The dominator satisfies both conditions against
    itself, so it is dropped too. The input mapping is not modified.
    """
    dominant_cover = coverage[dominator]

    survivors = {}
    for pred, cover in coverage.items():
        if dominator.cover_count <= pred.cover_count and dominant_cover >= cover:
            # Dominated: costs at least as much and covers nothing extra.
            continue
        survivors[pred] = cover

    return survivors
23+
24+
25+
def _uncovered_by(
    coverage: Mapping[Any, frozenset[int]], covered: frozenset[int]
) -> dict[Any, frozenset[int]]:
    """Subtract ``covered`` from every cover set in ``coverage``.

    Returns a new dict mapping each key to what is left of its cover after
    removing ``covered``. Keys whose cover becomes empty are omitted.
    """
    return {
        predicate: leftover
        for predicate, cover in coverage.items()
        if (leftover := cover - covered)
    }
35+
36+
37+
def _order_by(
    candidates: Mapping[Predicate, Sequence[Any]], p: Predicate
) -> tuple[int, float]:
    """Build the sort key used to rank predicate ``p``.

    The key is ``(size of p's cover in candidates, -p.cover_count)``, so
    under ``max`` a larger cover wins and ties go to the smaller
    ``cover_count``.
    """
    cover_size = len(candidates[p])
    negated_cost = -p.cover_count
    return (cover_size, negated_cost)
41+
42+
43+
def _score(partial: Iterable[Predicate]) -> float:
    """Return the total ``cover_count`` of the predicates in ``partial``."""
    costs = (predicate.cover_count for predicate in partial)
    return sum(costs)
45+
46+
47+
def _suppress_recursion_wrapper(func):
    """Decorate ``func`` so a ``RecursionError`` becomes a warning.

    The returned wrapper forwards all positional and keyword arguments to
    ``func`` and returns its result. If ``func`` raises ``RecursionError``,
    the wrapper emits a warning via ``warnings.warn`` and returns ``None``
    instead of propagating the exception. Other exceptions pass through.
    """

    # Preserve the wrapped function's name and docstring for debugging.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except RecursionError:
            warnings.warn("Recursion limit reached while searching for predicates")
            return None

    return wrapper
55+
56+
57+
def search(candidates, target: int, max_calls: int) -> Partial:
    """Search for a cheap set of predicates that covers at least ``target`` items.

    ``candidates`` maps each predicate to the frozenset of items it covers.
    The search recursively tries including or excluding one predicate at a
    time, and keeps the cheapest tuple of predicates (lowest summed
    ``cover_count``) whose combined cover, measured against the covers
    passed in at the start, has at least ``target`` items.

    ``max_calls`` limits how many times the recursive step may run. If no
    qualifying set is found, the empty tuple is returned.
    """
    budget = max_calls

    best_score = float("inf")
    best_partial: Partial = ()

    # Snapshot of the starting covers: the recursion works on shrinking
    # covers, but coverage of a partial is always measured against these.
    full_cover = candidates.copy()

    def _covered(partial: Partial) -> int:
        if not partial:
            return 0
        return len(frozenset.union(*(full_cover[p] for p in partial)))

    # RecursionError raised inside walk is turned into a warning by the
    # decorator, so a deep search ends with whatever was found so far.
    @_suppress_recursion_wrapper
    def walk(candidates: Cover, partial: Partial = ()) -> None:
        nonlocal budget
        nonlocal best_partial
        nonlocal best_score

        if budget <= 0:
            return

        budget -= 1

        covered = _covered(partial)
        score = _score(partial)

        if covered >= target:
            # Target met: record this partial if it beats the best so far.
            if score < best_score:
                best_partial = partial
                best_score = score
            return

        # Only predicates cheaper than the remaining window can still lead
        # to a total score below the best one found so far.
        window = best_score - score
        candidates = {
            p: cover for p, cover in candidates.items() if p.cover_count < window
        }

        reachable = _reachable(candidates) + covered

        if not candidates or reachable < target:
            return

        order_by = functools.partial(_order_by, candidates)
        best = max(candidates, key=order_by)

        # Branch 1: take `best`, and strip its cover from the others.
        remaining = _uncovered_by(candidates, candidates[best])
        walk(remaining, partial + (best,))
        del remaining

        # Branch 2: skip `best`, dropping it and what it dominates.
        reduced = _remove_dominated(candidates, best)
        walk(reduced, partial)
        del reduced

    walk(candidates)
    return best_partial

dedupe/training.py

Lines changed: 3 additions & 111 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,10 @@
1010
from typing import TYPE_CHECKING, overload
1111
from warnings import warn
1212

13-
from . import blocking
13+
from . import blocking, branch_and_bound
1414

1515
if TYPE_CHECKING:
16-
from typing import Any, Iterable, Mapping, Sequence
16+
from typing import Iterable, Sequence
1717

1818
from ._typing import (
1919
ComparisonCover,
@@ -75,8 +75,7 @@ def learn(
7575
else:
7676
raise ValueError("candidate_type is not valid")
7777

78-
searcher = BranchBound(target_cover, 2500)
79-
final_predicates = searcher.search(candidate_cover)
78+
final_predicates = branch_and_bound.search(candidate_cover, target_cover, 2500)
8079

8180
logger.info("Final predicate set:")
8281
for predicate in final_predicates:
@@ -329,113 +328,6 @@ def coveredPairs(self, blocker, records_1, records_2):
329328
return pair_cover
330329

331330

332-
class BranchBound(object):
333-
def __init__(self, target: int, max_calls: int) -> None:
334-
self.target: int = target
335-
self.calls: int = max_calls
336-
337-
self.cheapest_score: float = float("inf")
338-
self.original_cover: Cover = {}
339-
self.cheapest: tuple[Predicate, ...] = ()
340-
341-
def search(
342-
self, candidates: Cover, partial: tuple[Predicate, ...] = ()
343-
) -> tuple[Predicate, ...]:
344-
if self.calls <= 0:
345-
return self.cheapest
346-
347-
if not self.original_cover:
348-
self.original_cover = candidates.copy()
349-
350-
self.calls -= 1
351-
352-
covered = self.covered(partial)
353-
score = self.score(partial)
354-
355-
if covered >= self.target:
356-
if score < self.cheapest_score:
357-
self.cheapest = partial
358-
self.cheapest_score = score
359-
360-
else:
361-
window = self.cheapest_score - score
362-
363-
candidates = {
364-
p: cover for p, cover in candidates.items() if p.cover_count < window
365-
}
366-
367-
reachable = self.reachable(candidates) + covered
368-
369-
if candidates and reachable >= self.target:
370-
order_by = functools.partial(self.order_by, candidates)
371-
372-
best = max(candidates, key=order_by)
373-
374-
remaining = self.uncovered_by(candidates, candidates[best])
375-
try:
376-
self.search(remaining, partial + (best,))
377-
except RecursionError:
378-
return self.cheapest
379-
380-
del remaining
381-
382-
reduced = self.remove_dominated(candidates, best)
383-
384-
try:
385-
self.search(reduced, partial)
386-
except RecursionError:
387-
return self.cheapest
388-
389-
del reduced
390-
391-
return self.cheapest
392-
393-
@staticmethod
394-
def order_by(
395-
candidates: Mapping[Predicate, Sequence[Any]], p: Predicate
396-
) -> tuple[int, float]:
397-
return (len(candidates[p]), -p.cover_count)
398-
399-
@staticmethod
400-
def score(partial: Iterable[Predicate]) -> float:
401-
return sum(p.cover_count for p in partial)
402-
403-
def covered(self, partial: tuple[Predicate, ...]) -> int:
404-
if partial:
405-
return len(frozenset.union(*(self.original_cover[p] for p in partial)))
406-
else:
407-
return 0
408-
409-
@staticmethod
410-
def reachable(dupe_cover: Mapping[Any, frozenset[int]]) -> int:
411-
if dupe_cover:
412-
return len(frozenset.union(*dupe_cover.values()))
413-
else:
414-
return 0
415-
416-
@staticmethod
417-
def remove_dominated(coverage: Cover, dominator: Predicate) -> Cover:
418-
dominant_cover = coverage[dominator]
419-
420-
for pred, cover in coverage.copy().items():
421-
if dominator.cover_count <= pred.cover_count and dominant_cover >= cover:
422-
del coverage[pred]
423-
424-
return coverage
425-
426-
@staticmethod
427-
def uncovered_by(
428-
coverage: Mapping[Any, frozenset[int]], covered: frozenset[int]
429-
) -> dict[Any, frozenset[int]]:
430-
remaining = {}
431-
for predicate, uncovered in coverage.items():
432-
still_uncovered = uncovered - covered
433-
if still_uncovered:
434-
remaining[predicate] = still_uncovered
435-
436-
return remaining
437-
438-
439331
class InfiniteSet(object):
440332
def __and__(self, item):
441333
return item

pyproject.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@ dependencies = [
2828
"scikit-learn",
2929
"affinegap>=1.3",
3030
"categorical-distance>=1.9",
31-
"dedupe-variable-datetime",
3231
"numpy>=1.20",
3332
"doublemetaphone",
3433
"highered>=0.2.0",

0 commit comments

Comments
 (0)