Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit f50bd64

Browse files
authored
Merge branch 'master' into config
2 parents 3505a4d + 68a9d3c commit f50bd64

File tree

6 files changed

+84
-18
lines changed

6 files changed

+84
-18
lines changed

data_diff/databases/base.py

Lines changed: 9 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -9,12 +9,14 @@
99

1010
from data_diff.utils import is_uuid, safezip
1111
from .database_types import (
12-
ColType_UUID,
1312
AbstractDatabase,
1413
ColType,
1514
Integer,
1615
Decimal,
1716
Float,
17+
ColType_UUID,
18+
Native_UUID,
19+
String_UUID,
1820
TemporalType,
1921
UnknownColType,
2022
Text,
@@ -162,7 +164,7 @@ def _parse_type(
162164
)
163165
)
164166

165-
elif issubclass(cls, Text):
167+
elif issubclass(cls, (Text, Native_UUID)):
166168
return cls()
167169

168170
raise TypeError(f"Parsing {type_repr} returned an unknown type '{cls}'.")
@@ -198,7 +200,7 @@ def _refine_coltypes(self, table_path: DbPath, col_dict: Dict[str, ColType]):
198200
if not text_columns:
199201
return
200202

201-
fields = [self.normalize_uuid(c, ColType_UUID()) for c in text_columns]
203+
fields = [self.normalize_uuid(c, String_UUID()) for c in text_columns]
202204
samples_by_row = self.query(Select(fields, TableName(table_path), limit=16), list)
203205
if not samples_by_row:
204206
logger.warning(f"Table {table_path} is empty.")
@@ -216,7 +218,7 @@ def _refine_coltypes(self, table_path: DbPath, col_dict: Dict[str, ColType]):
216218
)
217219
else:
218220
assert col_name in col_dict
219-
col_dict[col_name] = ColType_UUID()
221+
col_dict[col_name] = String_UUID()
220222

221223
# @lru_cache()
222224
# def get_table_schema(self, path: DbPath) -> Dict[str, ColType]:
@@ -241,7 +243,9 @@ def offset_limit(self, offset: Optional[int] = None, limit: Optional[int] = None
241243
return f"LIMIT {limit}"
242244

243245
def normalize_uuid(self, value: str, coltype: ColType_UUID) -> str:
244-
return f"TRIM({value})"
246+
if isinstance(coltype, String_UUID):
247+
return f"TRIM({value})"
248+
return self.to_string(value)
245249

246250

247251
class ThreadedDatabase(Database):

data_diff/databases/database_types.py

Lines changed: 11 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -58,12 +58,13 @@ class FractionalType(NumericType):
5858
class Float(FractionalType):
5959
pass
6060

61+
6162
class IKey(ABC):
6263
"Interface for ColType, for using a column as a key in data-diff"
6364
python_type: type
6465

6566

66-
class Decimal(FractionalType, IKey): # Snowflake may use Decimal as a key
67+
class Decimal(FractionalType, IKey): # Snowflake may use Decimal as a key
6768
@property
6869
def python_type(self) -> type:
6970
if self.precision == 0:
@@ -75,10 +76,18 @@ class StringType(ColType):
7576
pass
7677

7778

78-
class ColType_UUID(StringType, IKey):
79+
class ColType_UUID(ColType, IKey):
7980
python_type = ArithUUID
8081

8182

83+
class Native_UUID(ColType_UUID):
84+
pass
85+
86+
87+
class String_UUID(StringType, ColType_UUID):
88+
pass
89+
90+
8291
@dataclass
8392
class Text(StringType):
8493
supported = False

data_diff/databases/postgresql.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -30,6 +30,8 @@ class PostgreSQL(ThreadedDatabase):
3030
"character varying": Text,
3131
"varchar": Text,
3232
"text": Text,
33+
# UUID
34+
"uuid": Native_UUID,
3335
}
3436
ROUNDS_ON_PREC_LOSS = True
3537

data_diff/diff_tables.py

Lines changed: 13 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -19,10 +19,10 @@
1919
from .databases.database_types import (
2020
ArithUUID,
2121
IKey,
22+
Native_UUID,
2223
NumericType,
2324
PrecisionType,
2425
StringType,
25-
UnknownColType,
2626
Schema,
2727
Schema_CaseInsensitive,
2828
Schema_CaseSensitive,
@@ -96,11 +96,22 @@ def _normalize_column(self, name: str, template: str = None) -> str:
9696
"Cannot compile query when the schema is unknown. Please use TableSegment.with_schema()."
9797
)
9898

99+
col_type = self._schema[name]
99100
col = self._quote_column(name)
101+
102+
if isinstance(col_type, Native_UUID):
103+
# Normalize first, apply template after (for uuids)
104+
# Needed because min/max(uuid) fails in postgresql
105+
col = self.database.normalize_value_by_type(col, col_type)
106+
if template is not None:
107+
col = template % col # Apply template using Python's string formatting
108+
return col
109+
110+
# Apply template before normalizing (for ints)
100111
if template is not None:
101112
col = template % col # Apply template using Python's string formatting
102113

103-
return self.database.normalize_value_by_type(col, self._schema[name])
114+
return self.database.normalize_value_by_type(col, col_type)
104115

105116
def with_schema(self) -> "TableSegment":
106117
"Queries the table schema from the database, and returns a new instance of TableSegmentWithSchema."

tests/test_diff_tables.py

Lines changed: 3 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -339,10 +339,8 @@ def setUp(self):
339339
self.null_uuid = uuid.uuid1(32132131)
340340
queries += [
341341
f"CREATE TABLE {self.table_dst} AS SELECT * FROM {self.table_src}",
342-
343342
f"INSERT INTO {self.table_src} VALUES ('{self.null_uuid}', NULL)",
344-
345-
"COMMIT"
343+
"COMMIT",
346344
]
347345

348346
for query in queries:
@@ -366,15 +364,11 @@ def setUp(self):
366364
f"DROP TABLE IF EXISTS {self.table_src}",
367365
f"DROP TABLE IF EXISTS {self.table_dst}",
368366
f"CREATE TABLE {self.table_src}(id varchar(100), comment varchar(1000))",
369-
370367
f"INSERT INTO {self.table_src} VALUES ('{uuid.uuid1(1)}', '1')",
371-
372368
f"CREATE TABLE {self.table_dst} AS SELECT * FROM {self.table_src}",
373-
374369
# Add a row where a column has NULL value
375370
f"INSERT INTO {self.table_src} VALUES ('{self.null_uuid}', NULL)",
376-
377-
"COMMIT"
371+
"COMMIT",
378372
]
379373

380374
for query in queries:
@@ -500,7 +494,7 @@ def test_left_table_empty(self):
500494
queries = [
501495
f"INSERT INTO {self.table_dst} SELECT id, comment FROM {self.table_src}",
502496
f"TRUNCATE {self.table_src}",
503-
"COMMIT"
497+
"COMMIT",
504498
]
505499
for query in queries:
506500
self.connection.query(query, None)

tests/test_postgresql.py

Lines changed: 46 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,46 @@
1+
import unittest
2+
3+
from data_diff.databases.connect import connect_to_uri
4+
from data_diff import TableSegment, TableDiffer
5+
from .common import TEST_POSTGRESQL_CONN_STRING, random_table_suffix
6+
7+
8+
class TestWithConnection(unittest.TestCase):
9+
def setUp(self) -> None:
10+
self.connection = connect_to_uri(TEST_POSTGRESQL_CONN_STRING)
11+
12+
self.connection.query('CREATE EXTENSION IF NOT EXISTS "uuid-ossp";', None)
13+
14+
table_suffix = random_table_suffix()
15+
16+
self.table_src = f"src{table_suffix}"
17+
self.table_dst = f"dst{table_suffix}"
18+
19+
def test_uuid(self):
20+
queries = [
21+
f"DROP TABLE IF EXISTS {self.table_src}",
22+
f"DROP TABLE IF EXISTS {self.table_dst}",
23+
f"CREATE TABLE {self.table_src} (id uuid DEFAULT uuid_generate_v4 (), comment VARCHAR, PRIMARY KEY (id))",
24+
"COMMIT",
25+
]
26+
for i in range(100):
27+
queries.append(f"INSERT INTO {self.table_src}(comment) VALUES ('{i}')")
28+
29+
queries += [
30+
"COMMIT",
31+
f"CREATE TABLE {self.table_dst} AS SELECT * FROM {self.table_src}",
32+
"COMMIT",
33+
]
34+
35+
queries.append(f"INSERT INTO {self.table_src}(comment) VALUES ('This one is different')")
36+
37+
for query in queries:
38+
self.connection.query(query, None)
39+
40+
a = TableSegment(self.connection, (self.table_src,), "id", "comment")
41+
b = TableSegment(self.connection, (self.table_dst,), "id", "comment")
42+
43+
differ = TableDiffer()
44+
diff = list(differ.diff_tables(a, b))
45+
uuid = diff[0][1][0]
46+
self.assertEqual(diff, [("-", (uuid, "This one is different"))])

0 commit comments

Comments
 (0)