Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit fc7a187

Browse files
authored
Merge pull request #148 from datafold/pik94-fix-uuid-things
Corrections for PR #144 - fix UUID things
2 parents b2549ea + 715c12c commit fc7a187

File tree

5 files changed

+205
-8
lines changed

5 files changed

+205
-8
lines changed

data_diff/databases/base.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -200,9 +200,14 @@ def _refine_coltypes(self, table_path: DbPath, col_dict: Dict[str, ColType]):
200200

201201
fields = [self.normalize_uuid(c, ColType_UUID()) for c in text_columns]
202202
samples_by_row = self.query(Select(fields, TableName(table_path), limit=16), list)
203+
if not samples_by_row:
204+
logger.warning(f"Table {table_path} is empty.")
205+
return
206+
203207
samples_by_col = list(zip(*samples_by_row))
208+
204209
for col_name, samples in safezip(text_columns, samples_by_col):
205-
uuid_samples = list(filter(is_uuid, samples))
210+
uuid_samples = [s for s in samples if s and is_uuid(s)]
206211

207212
if uuid_samples:
208213
if len(uuid_samples) != len(samples):

data_diff/databases/database_types.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -53,8 +53,11 @@ class FractionalType(NumericType):
5353
class Float(FractionalType):
5454
pass
5555

56+
class IKey(ABC):
57+
"Interface for ColType, for using a column as a key in data-diff"
58+
python_type: type
5659

57-
class Decimal(FractionalType):
60+
class Decimal(FractionalType, IKey): # Snowflake may use Decimal as a key
5861
@property
5962
def python_type(self) -> type:
6063
if self.precision == 0:
@@ -66,11 +69,6 @@ class StringType(ColType):
6669
pass
6770

6871

69-
class IKey(ABC):
70-
"Interface for ColType, for using a column as a key in data-diff"
71-
python_type: type
72-
73-
7472
class ColType_UUID(StringType, IKey):
7573
python_type = ArithUUID
7674

data_diff/diff_tables.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from .databases.base import Database
1919
from .databases.database_types import (
2020
ArithUUID,
21+
IKey,
2122
NumericType,
2223
PrecisionType,
2324
StringType,
@@ -212,6 +213,8 @@ def count_and_checksum(self) -> Tuple[int, int]:
212213
"We recommend increasing --bisection-factor or decreasing --threads."
213214
)
214215

216+
if count:
217+
assert checksum, (count, checksum)
215218
return count or 0, checksum if checksum is None else int(checksum)
216219

217220
def query_key_range(self) -> Tuple[int, int]:
@@ -306,6 +309,10 @@ def diff_tables(self, table1: TableSegment, table2: TableSegment) -> DiffResult:
306309

307310
key_type = table1._schema[table1.key_column]
308311
key_type2 = table2._schema[table2.key_column]
312+
if not isinstance(key_type, IKey):
313+
raise NotImplementedError(f"Cannot use column of type {key_type} as a key")
314+
if not isinstance(key_type2, IKey):
315+
raise NotImplementedError(f"Cannot use column of type {key_type2} as a key")
309316
assert key_type.python_type is key_type2.python_type
310317

311318
# We add 1 because our ranges are exclusive of the end (like in Python)

data_diff/sql.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,9 +121,10 @@ class Checksum(Sql):
121121

122122
def compile(self, c: Compiler):
123123
if len(self.exprs) > 1:
124-
compiled_exprs = ", ".join(map(c.compile, self.exprs))
124+
compiled_exprs = ", ".join(f"coalesce({c.compile(expr)}, '<null>')" for expr in self.exprs)
125125
expr = f"concat({compiled_exprs})"
126126
else:
127+
# No need to coalesce - safe to assume that key cannot be null
127128
(expr,) = self.exprs
128129
expr = c.compile(expr)
129130
md5 = c.database.md5_to_int(expr)

tests/test_diff_tables.py

Lines changed: 186 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -321,3 +321,189 @@ def test_table_segment(self):
321321
self.assertRaises(ValueError, self.table.replace, min_update=late, max_update=early)
322322

323323
self.assertRaises(ValueError, self.table.replace, min_key=10, max_key=0)
324+
325+
326+
class TestTableUUID(TestWithConnection):
327+
def setUp(self):
328+
super().setUp()
329+
330+
queries = [
331+
f"DROP TABLE IF EXISTS {self.table_src}",
332+
f"DROP TABLE IF EXISTS {self.table_dst}",
333+
f"CREATE TABLE {self.table_src}(id varchar(100), comment varchar(1000))",
334+
]
335+
for i in range(10):
336+
uuid_value = uuid.uuid1(i)
337+
queries.append(f"INSERT INTO {self.table_src} VALUES ('{uuid_value}', '{uuid_value}')")
338+
339+
self.null_uuid = uuid.uuid1(32132131)
340+
queries += [
341+
f"CREATE TABLE {self.table_dst} AS SELECT * FROM {self.table_src}",
342+
343+
f"INSERT INTO {self.table_src} VALUES ('{self.null_uuid}', NULL)",
344+
345+
"COMMIT"
346+
]
347+
348+
for query in queries:
349+
self.connection.query(query, None)
350+
351+
self.a = TableSegment(self.connection, (self.table_src,), "id", "comment")
352+
self.b = TableSegment(self.connection, (self.table_dst,), "id", "comment")
353+
354+
def test_uuid_column_with_nulls(self):
355+
differ = TableDiffer()
356+
diff = list(differ.diff_tables(self.a, self.b))
357+
self.assertEqual(diff, [("-", (str(self.null_uuid), None))])
358+
359+
360+
class TestTableNullRowChecksum(TestWithConnection):
361+
def setUp(self):
362+
super().setUp()
363+
364+
self.null_uuid = uuid.uuid1(1)
365+
queries = [
366+
f"DROP TABLE IF EXISTS {self.table_src}",
367+
f"DROP TABLE IF EXISTS {self.table_dst}",
368+
f"CREATE TABLE {self.table_src}(id varchar(100), comment varchar(1000))",
369+
370+
f"INSERT INTO {self.table_src} VALUES ('{uuid.uuid1(1)}', '1')",
371+
372+
f"CREATE TABLE {self.table_dst} AS SELECT * FROM {self.table_src}",
373+
374+
# Add a row where a column has NULL value
375+
f"INSERT INTO {self.table_src} VALUES ('{self.null_uuid}', NULL)",
376+
377+
"COMMIT"
378+
]
379+
380+
for query in queries:
381+
self.connection.query(query, None)
382+
383+
self.a = TableSegment(self.connection, (self.table_src,), "id", "comment")
384+
self.b = TableSegment(self.connection, (self.table_dst,), "id", "comment")
385+
386+
def test_uuid_columns_with_nulls(self):
387+
"""
388+
Here we test a case when in one segment one or more columns has only null values. For example,
389+
Table A:
390+
| id | value |
391+
|------|-----------|
392+
| pk_1 | 'value_1' |
393+
| pk_2 | NULL |
394+
395+
Table B:
396+
| id | value |
397+
|------|-----------|
398+
| pk_1 | 'value_1' |
399+
400+
We can choose some bisection factor and bisection threshold (2 and 3 for our example, respectively)
401+
that one segment will look like ('pk_2', NULL). Some databases, when we do a cast these values to string and
402+
try to concatenate, some databases return NULL when concatenating (for example, MySQL). As the result, all next
403+
operations like substring, sum etc return nulls that leads incorrect diff results: ('pk_2', null) should be in
404+
diff results, but it's not. This test helps to detect such cases.
405+
"""
406+
407+
differ = TableDiffer(bisection_factor=2, bisection_threshold=3)
408+
diff = list(differ.diff_tables(self.a, self.b))
409+
self.assertEqual(diff, [("-", (str(self.null_uuid), None))])
410+
411+
412+
class TestConcatMultipleColumnWithNulls(TestWithConnection):
413+
def setUp(self):
414+
super().setUp()
415+
416+
queries = [
417+
f"DROP TABLE IF EXISTS {self.table_src}",
418+
f"DROP TABLE IF EXISTS {self.table_dst}",
419+
f"CREATE TABLE {self.table_src}(id varchar(100), c1 varchar(100), c2 varchar(100))",
420+
f"CREATE TABLE {self.table_dst}(id varchar(100), c1 varchar(100), c2 varchar(100))",
421+
]
422+
423+
self.diffs = []
424+
for i in range(0, 8):
425+
pk = uuid.uuid1(i)
426+
table_src_c1_val = str(i)
427+
table_dst_c1_val = str(i) + "-different"
428+
429+
queries.append(f"INSERT INTO {self.table_src} VALUES ('{pk}', '{table_src_c1_val}', NULL)")
430+
queries.append(f"INSERT INTO {self.table_dst} VALUES ('{pk}', '{table_dst_c1_val}', NULL)")
431+
432+
self.diffs.append(("-", (str(pk), table_src_c1_val, None)))
433+
self.diffs.append(("+", (str(pk), table_dst_c1_val, None)))
434+
435+
queries.append("COMMIT")
436+
437+
for query in queries:
438+
self.connection.query(query, None)
439+
440+
self.a = TableSegment(self.connection, (self.table_src,), "id", extra_columns=("c1", "c2"))
441+
self.b = TableSegment(self.connection, (self.table_dst,), "id", extra_columns=("c1", "c2"))
442+
443+
def test_tables_are_different(self):
444+
"""
445+
Here we test a case when in one segment one or more columns has only null values. For example,
446+
Table A:
447+
| id | c1 | c2 |
448+
|------|----|------|
449+
| pk_1 | 1 | NULL |
450+
| pk_2 | 2 | NULL |
451+
...
452+
| pk_n | n | NULL |
453+
454+
Table B:
455+
| id | c1 | c2 |
456+
|------|--------|------|
457+
| pk_1 | 1-diff | NULL |
458+
| pk_2 | 2-diff | NULL |
459+
...
460+
| pk_n | n-diff | NULL |
461+
462+
To calculate a checksum, we need to concatenate string values by rows. If both tables have columns with NULL
463+
value, it may lead that concat(pk_i, i, NULL) == concat(pk_i, i-diff, NULL). This test handle such cases.
464+
"""
465+
466+
differ = TableDiffer(bisection_factor=2, bisection_threshold=4)
467+
diff = list(differ.diff_tables(self.a, self.b))
468+
self.assertEqual(diff, self.diffs)
469+
470+
471+
class TestTableTableEmpty(TestWithConnection):
472+
def setUp(self):
473+
super().setUp()
474+
475+
self.null_uuid = uuid.uuid1(1)
476+
queries = [
477+
f"DROP TABLE IF EXISTS {self.table_src}",
478+
f"DROP TABLE IF EXISTS {self.table_dst}",
479+
f"CREATE TABLE {self.table_src}(id varchar(100), comment varchar(1000))",
480+
f"CREATE TABLE {self.table_dst}(id varchar(100), comment varchar(1000))",
481+
]
482+
483+
self.diffs = [(uuid.uuid1(i), i) for i in range(100)]
484+
for pk, value in self.diffs:
485+
queries.append(f"INSERT INTO {self.table_src} VALUES ('{pk}', '{value}')")
486+
487+
queries.append("COMMIT")
488+
489+
for query in queries:
490+
self.connection.query(query, None)
491+
492+
self.a = TableSegment(self.connection, (self.table_src,), "id", "comment")
493+
self.b = TableSegment(self.connection, (self.table_dst,), "id", "comment")
494+
495+
def test_right_table_empty(self):
496+
differ = TableDiffer()
497+
self.assertRaises(ValueError, differ.diff_tables, self.a, self.b)
498+
499+
def test_left_table_empty(self):
500+
queries = [
501+
f"INSERT INTO {self.table_dst} SELECT id, comment FROM {self.table_src}",
502+
f"TRUNCATE {self.table_src}",
503+
"COMMIT"
504+
]
505+
for query in queries:
506+
self.connection.query(query, None)
507+
508+
differ = TableDiffer()
509+
self.assertRaises(ValueError, differ.diff_tables, self.a, self.b)

0 commit comments

Comments
 (0)