Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit 746b72e

Browse files
authored
Merge pull request #383 from nicolasaldecoa/test-sqeleton-pr15
Json matching & tests for sqeleton PR #15
2 parents 14193b9 + 39445bf commit 746b72e

File tree

3 files changed

+76
-13
lines changed

3 files changed

+76
-13
lines changed

data_diff/hashdiff_tables.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,10 @@
77

88
from runtype import dataclass
99

10-
from data_diff.sqeleton.abcs import ColType_UUID, NumericType, PrecisionType, StringType, Boolean
10+
from data_diff.sqeleton.abcs import ColType_UUID, NumericType, PrecisionType, StringType, Boolean, JSONType
1111

1212
from .info_tree import InfoTree
13-
from .utils import safezip
13+
from .utils import safezip, diffs_are_equiv_jsons
1414
from .thread_utils import ThreadedYielder
1515
from .table_segment import TableSegment
1616

@@ -24,7 +24,7 @@
2424
logger = logging.getLogger("hashdiff_tables")
2525

2626

27-
def diff_sets(a: set, b: set) -> Iterator:
27+
def diff_sets(a: list, b: list, json_cols: dict = None) -> Iterator:
2828
sa = set(a)
2929
sb = set(b)
3030

@@ -38,7 +38,17 @@ def diff_sets(a: set, b: set) -> Iterator:
3838
if row not in sa:
3939
d[row[0]].append(("+", row))
4040

41+
warned_diff_cols = set()
4142
for _k, v in sorted(d.items(), key=lambda i: i[0]):
43+
if json_cols:
44+
parsed_match, overriden_diff_cols = diffs_are_equiv_jsons(v, json_cols)
45+
if parsed_match:
46+
to_warn = overriden_diff_cols - warned_diff_cols
47+
for w in to_warn:
48+
logger.warning(f"Equivalent JSON objects with different string representations detected "
49+
f"in column '{w}'. These cases are NOT reported as differences.")
50+
warned_diff_cols.add(w)
51+
continue
4252
yield from v
4353

4454

@@ -194,7 +204,9 @@ def _bisect_and_diff_segments(
194204
# This saves time, as bisection speed is limited by ping and query performance.
195205
if max_rows < self.bisection_threshold or max_space_size < self.bisection_factor * 2:
196206
rows1, rows2 = self._threaded_call("get_values", [table1, table2])
197-
diff = list(diff_sets(rows1, rows2))
207+
json_cols = {i: colname for i, colname in enumerate(table1.extra_columns)
208+
if isinstance(table1._schema[colname], JSONType)}
209+
diff = list(diff_sets(rows1, rows2, json_cols))
198210

199211
info_tree.info.set_diff(diff)
200212
info_tree.info.rowcounts = {1: len(rows1), 2: len(rows2)}

data_diff/utils.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import json
12
import logging
23
import re
34
from typing import Dict, Iterable, Sequence
@@ -144,3 +145,27 @@ def dbt_diff_string_template(
144145
string_output += f"\n{k}: {v}"
145146

146147
return string_output
148+
149+
150+
def _jsons_equiv(a: str, b: str):
151+
try:
152+
return json.loads(a) == json.loads(b)
153+
except (ValueError, TypeError, json.decoder.JSONDecodeError): # not valid jsons
154+
return False
155+
156+
157+
def diffs_are_equiv_jsons(diff: list, json_cols: dict):
    """Decide whether a ('-', row)/('+', row) pair differs only in JSON
    columns whose values are equivalent JSON objects with different
    string representations.

    diff: list of (sign, row) tuples for one key; only a matched
        '-'/'+' pair of length 2 can be equivalent.
    json_cols: mapping of extra-column index -> column name for the
        columns that hold JSON values.
    Returns a (match, overridden_diff_cols) tuple, where
    overridden_diff_cols is the set of JSON column names whose textual
    difference was overridden by JSON equivalence.
    """
    if (len(diff) != 2) or ({diff[0][0], diff[1][0]} != {'+', '-'}):
        # BUGFIX: always return a 2-tuple so callers can unpack
        # unconditionally. The previous bare `False` crashed the
        # `parsed_match, cols = diffs_are_equiv_jsons(...)` unpacking
        # in diff_sets whenever a key existed on only one side.
        return False, set()
    match = True
    overridden_diff_cols = set()
    # row[0] is the key column, so extra columns start at index 1;
    # enumerate() indices then line up with the json_cols mapping.
    for i, (col_a, col_b) in enumerate(safezip(diff[0][1][1:], diff[1][1][1:])):
        # Only JSONType columns are parsed, but non-JSON columns must
        # still compare equal for the rows to be equivalent.
        match = col_a == col_b
        if not match and (i in json_cols):
            if _jsons_equiv(col_a, col_b):
                overridden_diff_cols.add(json_cols[i])
                match = True
        if not match:
            break
    return match, overridden_diff_cols

tests/test_database_types.py

Lines changed: 35 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,10 @@ def init_conns():
7474
"boolean": [
7575
"boolean",
7676
],
77+
"json": [
78+
"json",
79+
"jsonb"
80+
]
7781
},
7882
db.MySQL: {
7983
# https://dev.mysql.com/doc/refman/8.0/en/integer-types.html
@@ -199,6 +203,9 @@ def init_conns():
199203
"boolean": [
200204
"boolean",
201205
],
206+
"json": [
207+
"super",
208+
]
202209
},
203210
db.Oracle: {
204211
"int": [
@@ -469,12 +476,28 @@ def __iter__(self):
469476
return (uuid.uuid1(i) for i in range(self.max))
470477

471478

479+
class JsonFaker:
    """Deterministic supplier of sample JSON strings for type tests.

    Yields at most *max* entries from a fixed list of hand-written
    JSON documents.
    """

    MANUAL_FAKES = [
        '{"keyText": "text", "keyInt": 3, "keyFloat": 5.4445, "keyBoolean": true}',
    ]

    def __init__(self, max):
        # Upper bound on how many samples to produce.
        self.max = max

    def __iter__(self):
        yield from self.MANUAL_FAKES[: self.max]

    def __len__(self):
        # Never more than the number of hand-written fakes available.
        return min(self.max, len(self.MANUAL_FAKES))
492+
493+
472494
TYPE_SAMPLES = {
473495
"int": IntFaker(N_SAMPLES),
474496
"datetime": DateTimeFaker(N_SAMPLES),
475497
"float": FloatFaker(N_SAMPLES),
476498
"uuid": UUID_Faker(N_SAMPLES),
477499
"boolean": BooleanFaker(N_SAMPLES),
500+
"json": JsonFaker(N_SAMPLES)
478501
}
479502

480503

@@ -546,7 +569,7 @@ def expand_params(testcase_func, param_num, param):
546569
return name
547570

548571

549-
def _insert_to_table(conn, table_path, values, type):
572+
def _insert_to_table(conn, table_path, values, coltype):
550573
tbl = table(table_path)
551574

552575
current_n_rows = conn.query(tbl.count(), int)
@@ -555,31 +578,34 @@ def _insert_to_table(conn, table_path, values, type):
555578
return
556579
elif current_n_rows > 0:
557580
conn.query(drop_table(table_name))
558-
_create_table_with_indexes(conn, table_path, type)
581+
_create_table_with_indexes(conn, table_path, coltype)
559582

560583
# if BENCHMARK and N_SAMPLES > 10_000:
561584
# description = f"{conn.name}: {table}"
562585
# values = rich.progress.track(values, total=N_SAMPLES, description=description)
563586

564-
if type == "boolean":
587+
if coltype == "boolean":
565588
values = [(i, bool(sample)) for i, sample in values]
566-
elif re.search(r"(time zone|tz)", type):
589+
elif re.search(r"(time zone|tz)", coltype):
567590
values = [(i, sample.replace(tzinfo=timezone.utc)) for i, sample in values]
568591

569592
if isinstance(conn, db.Clickhouse):
570-
if type.startswith("DateTime64"):
593+
if coltype.startswith("DateTime64"):
571594
values = [(i, f"{sample.replace(tzinfo=None)}") for i, sample in values]
572595

573-
elif type == "DateTime":
596+
elif coltype == "DateTime":
574597
# Clickhouse's DateTime does not allow to store micro/milli/nano seconds
575598
values = [(i, str(sample)[:19]) for i, sample in values]
576599

577-
elif type.startswith("Decimal("):
578-
precision = int(type[8:].rstrip(")").split(",")[1])
600+
elif coltype.startswith("Decimal("):
601+
precision = int(coltype[8:].rstrip(")").split(",")[1])
579602
values = [(i, round(sample, precision)) for i, sample in values]
580-
elif isinstance(conn, db.BigQuery) and type == "datetime":
603+
elif isinstance(conn, db.BigQuery) and coltype == "datetime":
581604
values = [(i, Code(f"cast(timestamp '{sample}' as datetime)")) for i, sample in values]
582605

606+
if isinstance(conn, db.Redshift) and coltype == "json":
607+
values = [(i, Code(f"JSON_PARSE('{sample}')")) for i, sample in values]
608+
583609
insert_rows_in_batches(conn, tbl, values, columns=["id", "col"])
584610
conn.query(commit)
585611

0 commit comments

Comments
 (0)