@@ -231,13 +231,14 @@ def _bisect_and_diff_tables(
231
231
):
232
232
assert table1 .is_bounded and table2 .is_bounded
233
233
234
+ max_space_size = max (table1 .approximate_size (), table2 .approximate_size ())
234
235
if max_rows is None :
235
- # We can be sure that row_count <= max_rows
236
- max_rows = max ( table1 . approximate_size (), table2 . approximate_size ())
236
+ # We can be sure that row_count <= max_rows iff the table key is unique
237
+ max_rows = max_space_size
237
238
238
239
# If count is below the threshold, just download and compare the columns locally
239
240
# This saves time, as bisection speed is limited by ping and query performance.
240
- if max_rows < self .bisection_threshold :
241
+ if max_rows < self .bisection_threshold or max_space_size < self . bisection_factor * 2 :
241
242
rows1 , rows2 = self ._threaded_call ("get_values" , [table1 , table2 ])
242
243
diff = list (diff_sets (rows1 , rows2 ))
243
244
@@ -255,7 +256,8 @@ def _bisect_and_diff_tables(
255
256
return diff
256
257
257
258
# Choose evenly spaced checkpoints (according to min_key and max_key)
258
- checkpoints = table1 .choose_checkpoints (self .bisection_factor - 1 )
259
+ biggest_table = max (table1 , table2 , key = methodcaller ('approximate_size' ))
260
+ checkpoints = biggest_table .choose_checkpoints (self .bisection_factor - 1 )
259
261
260
262
# Create new instances of TableSegment between each checkpoint
261
263
segmented1 = table1 .segment_by_checkpoints (checkpoints )
0 commit comments