Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit 871016d

Browse files
authored
Merge pull request #249 from datafold/splitspace_error
Bugfix in algorithm: Trigger download if the segment space is smaller than twice the bisection factor
2 parents a4e29cd + 46a8c51 commit 871016d

File tree

1 file changed

+6
-4
lines changed

1 file changed

+6
-4
lines changed

data_diff/diff_tables.py

Lines changed: 6 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -231,13 +231,14 @@ def _bisect_and_diff_tables(
231231
):
232232
assert table1.is_bounded and table2.is_bounded
233233

234+
max_space_size = max(table1.approximate_size(), table2.approximate_size())
234235
if max_rows is None:
235-
# We can be sure that row_count <= max_rows
236-
max_rows = max(table1.approximate_size(), table2.approximate_size())
236+
# We can be sure that row_count <= max_rows iff the table key is unique
237+
max_rows = max_space_size
237238

238239
# If count is below the threshold, just download and compare the columns locally
239240
# This saves time, as bisection speed is limited by ping and query performance.
240-
if max_rows < self.bisection_threshold:
241+
if max_rows < self.bisection_threshold or max_space_size < self.bisection_factor * 2:
241242
rows1, rows2 = self._threaded_call("get_values", [table1, table2])
242243
diff = list(diff_sets(rows1, rows2))
243244

@@ -255,7 +256,8 @@ def _bisect_and_diff_tables(
255256
return diff
256257

257258
# Choose evenly spaced checkpoints (according to min_key and max_key)
258-
checkpoints = table1.choose_checkpoints(self.bisection_factor - 1)
259+
biggest_table = max(table1, table2, key=methodcaller('approximate_size'))
260+
checkpoints = biggest_table.choose_checkpoints(self.bisection_factor - 1)
259261

260262
# Create new instances of TableSegment between each checkpoint
261263
segmented1 = table1.segment_by_checkpoints(checkpoints)

0 commit comments

Comments
 (0)