Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit cac665b

Browse files
sungchun12, Sung Won Chung, and dlawin authored
Evolve dbt data diff output (#857)
* first draft
* style fixes by ruff
* past tense consistency
* working draft of new table
* style fixes by ruff
* dbt diffs work, cloud broken for now
* remove cached git repos
* efficient naming
* add type changed count
* reorder for priority on prod changes
* tabulate value diffs
* style fixes by ruff
* less horizontal space needed
* leo's feedback
* center align values
* consistent formatting
* shorter name same meaning
* row counts and diff values working
* deps impacts works now
* default val
* more readable
* add primary key used
* add model specific CI configs
* consistency
* conditional headers
* style fixes by ruff
* cleaner implementation
* more cleaning
* consistent format
* fix unchanged calc
* remove prints
* default value
* draft up tests
* a couple more tests
* new version
* passing tests
* style fixes by ruff
* util unit test
* add unit tests
* test the templates
* fix type hints
* real test no mocking
* update tests with all the new outputs
* add more validations for mock
* fix json bug

---------

Co-authored-by: Sung Won Chung <[email protected]>
Co-authored-by: sungchun12 <[email protected]>
Co-authored-by: Dan Lawin <[email protected]>
1 parent d4ca0e6 commit cac665b

File tree

11 files changed

+304
-61
lines changed

11 files changed

+304
-61
lines changed

Diff for: data_diff/cloud/datafold_api.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -144,18 +144,22 @@ class TSummaryResultSchemaStats(pydantic.BaseModel):
144144
exclusive_columns: Tuple[List[str], List[str]]
145145

146146

147+
class TSummaryResultDependencyDetails(pydantic.BaseModel):
148+
deps: Dict[str, List[Dict]]
149+
150+
147151
class TCloudApiDataDiffSummaryResult(pydantic.BaseModel):
148152
status: str
149153
pks: Optional[TSummaryResultPrimaryKeyStats]
150154
values: Optional[TSummaryResultValueStats]
151155
schema_: Optional[TSummaryResultSchemaStats]
152-
dependencies: Optional[Dict[str, Any]]
156+
deps: Optional[TSummaryResultDependencyDetails]
153157

154158
@classmethod
155159
def from_orm(cls, obj: Any) -> Self:
156160
pks = TSummaryResultPrimaryKeyStats(**obj["pks"]) if "pks" in obj else None
157161
values = TSummaryResultValueStats(**obj["values"]) if "values" in obj else None
158-
deps = obj["deps"] if "deps" in obj else None
162+
deps = TSummaryResultDependencyDetails(**obj["dependencies"]) if "dependencies" in obj else None
159163
schema = TSummaryResultSchemaStats(**obj["schema"]) if "schema" in obj else None
160164
return cls(
161165
status=obj["status"],

Diff for: data_diff/dbt.py

+57-20
Original file line numberDiff line numberDiff line change
@@ -306,12 +306,23 @@ def _local_diff(
306306
k for k, v in table2_columns.items() if k in table1_columns and v.data_type != table1_columns[k].data_type
307307
}
308308

309-
if columns_added:
310-
diff_output_str += columns_added_template(columns_added)
309+
diff_output_str += f"Primary Keys: {diff_vars.primary_keys} \n"
310+
311+
if diff_vars.where_filter:
312+
diff_output_str += f"Where Filter: '{str(diff_vars.where_filter)}' \n"
313+
314+
if diff_vars.include_columns:
315+
diff_output_str += f"Included Columns: {diff_vars.include_columns} \n"
316+
317+
if diff_vars.exclude_columns:
318+
diff_output_str += f"Excluded Columns: {diff_vars.exclude_columns} \n"
311319

312320
if columns_removed:
313321
diff_output_str += columns_removed_template(columns_removed)
314322

323+
if columns_added:
324+
diff_output_str += columns_added_template(columns_added)
325+
315326
if columns_type_changed:
316327
diff_output_str += columns_type_changed_template(columns_type_changed)
317328
column_set = column_set.difference(columns_type_changed)
@@ -349,13 +360,14 @@ def _local_diff(
349360
return
350361

351362
dataset1_columns = [
352-
(name, type_, table1.database.dialect.parse_type(table1.table_path, name, type_, *other))
353-
for (name, type_, *other) in table1_columns.values()
363+
(info.column_name, info.data_type, table1.database.dialect.parse_type(table1.table_path, info))
364+
for info in table1_columns.values()
354365
]
355366
dataset2_columns = [
356-
(name, type_, table2.database.dialect.parse_type(table2.table_path, name, type_, *other))
357-
for (name, type_, *other) in table2_columns.values()
367+
(info.column_name, info.data_type, table2.database.dialect.parse_type(table2.table_path, info))
368+
for info in table2_columns.values()
358369
]
370+
359371
print(
360372
json.dumps(
361373
jsonify(
@@ -455,32 +467,57 @@ def _cloud_diff(
455467
rows_removed_count = diff_results.pks.exclusives[0]
456468

457469
rows_updated = diff_results.values.rows_with_differences
458-
total_rows = diff_results.values.total_rows
459-
rows_unchanged = int(total_rows) - int(rows_updated)
470+
total_rows_table1 = diff_results.pks.total_rows[0]
471+
total_rows_table2 = diff_results.pks.total_rows[1]
472+
total_rows_diff = total_rows_table2 - total_rows_table1
473+
474+
rows_unchanged = int(total_rows_table1) - int(rows_updated) - int(rows_removed_count)
460475
diff_percent_list = {
461-
x.column_name: str(x.match) + "%" for x in diff_results.values.columns_diff_stats if x.match != 100.0
476+
x.column_name: f"{str(round(100.00 - x.match, 2))}%"
477+
for x in diff_results.values.columns_diff_stats
478+
if x.match != 100.0
462479
}
463-
columns_added = diff_results.schema_.exclusive_columns[1]
464-
columns_removed = diff_results.schema_.exclusive_columns[0]
480+
columns_added = set(diff_results.schema_.exclusive_columns[1])
481+
columns_removed = set(diff_results.schema_.exclusive_columns[0])
465482
column_type_changes = diff_results.schema_.column_type_differs
466483

467-
if columns_added:
468-
diff_output_str += columns_added_template(columns_added)
484+
diff_output_str += f"Primary Keys: {diff_vars.primary_keys} \n"
485+
if diff_vars.where_filter:
486+
diff_output_str += f"Where Filter: '{str(diff_vars.where_filter)}' \n"
487+
488+
if diff_vars.include_columns:
489+
diff_output_str += f"Included Columns: {diff_vars.include_columns} \n"
490+
491+
if diff_vars.exclude_columns:
492+
diff_output_str += f"Excluded Columns: {diff_vars.exclude_columns} \n"
469493

470494
if columns_removed:
471495
diff_output_str += columns_removed_template(columns_removed)
472496

497+
if columns_added:
498+
diff_output_str += columns_added_template(columns_added)
499+
473500
if column_type_changes:
474501
diff_output_str += columns_type_changed_template(column_type_changes)
475502

503+
deps_impacts = {
504+
key: len(value) + sum(len(item.get("BiHtSync", [])) for item in value) if key == "hightouch" else len(value)
505+
for key, value in diff_results.deps.deps.items()
506+
}
507+
476508
if any([rows_added_count, rows_removed_count, rows_updated]):
477509
diff_output = dbt_diff_string_template(
478-
rows_added_count,
479-
rows_removed_count,
480-
rows_updated,
481-
str(rows_unchanged),
482-
diff_percent_list,
483-
"Value Match Percent:",
510+
total_rows_table1=total_rows_table1,
511+
total_rows_table2=total_rows_table2,
512+
total_rows_diff=total_rows_diff,
513+
rows_added=rows_added_count,
514+
rows_removed=rows_removed_count,
515+
rows_updated=rows_updated,
516+
rows_unchanged=str(rows_unchanged),
517+
deps_impacts=deps_impacts,
518+
is_cloud=True,
519+
extra_info_dict=diff_percent_list,
520+
extra_info_str="Value Changed:",
484521
)
485522
diff_output_str += f"\n{diff_url}\n {diff_output} \n"
486523
rich.print(diff_output_str)
@@ -524,7 +561,7 @@ def _cloud_diff(
524561

525562

526563
def _diff_output_base(dev_path: str, prod_path: str) -> str:
527-
return f"\n[green]{prod_path} <> {dev_path}[/] \n"
564+
return f"\n[blue]{prod_path}[/] <> [green]{dev_path}[/] \n"
528565

529566

530567
def _initialize_events(dbt_user_id: Optional[str], dbt_version: Optional[str], dbt_project_id: Optional[str]) -> None:

Diff for: data_diff/diff_tables.py

+11-6
Original file line numberDiff line numberDiff line change
@@ -138,14 +138,19 @@ def _get_stats(self, is_dbt: bool = False) -> DiffStats:
138138
def get_stats_string(self, is_dbt: bool = False):
139139
diff_stats = self._get_stats(is_dbt)
140140

141+
total_rows_diff = diff_stats.table2_count - diff_stats.table1_count
142+
141143
if is_dbt:
142144
string_output = dbt_diff_string_template(
143-
diff_stats.diff_by_sign["+"],
144-
diff_stats.diff_by_sign["-"],
145-
diff_stats.diff_by_sign["!"],
146-
diff_stats.unchanged,
147-
diff_stats.extra_column_diffs,
148-
"Values Updated:",
145+
total_rows_table1=diff_stats.table1_count,
146+
total_rows_table2=diff_stats.table2_count,
147+
total_rows_diff=total_rows_diff,
148+
rows_added=diff_stats.diff_by_sign["+"],
149+
rows_removed=diff_stats.diff_by_sign["-"],
150+
rows_updated=diff_stats.diff_by_sign["!"],
151+
rows_unchanged=diff_stats.unchanged,
152+
extra_info_dict=diff_stats.extra_column_diffs,
153+
extra_info_str="[u]Values Changed[/u]",
149154
)
150155

151156
else:

Diff for: data_diff/utils.py

+53-13
Original file line numberDiff line numberDiff line change
@@ -459,19 +459,59 @@ def __repr__(self) -> str:
459459

460460

461461
def dbt_diff_string_template(
462-
rows_added: str, rows_removed: str, rows_updated: str, rows_unchanged: str, extra_info_dict: Dict, extra_info_str
462+
total_rows_table1: int,
463+
total_rows_table2: int,
464+
total_rows_diff: int,
465+
rows_added: int,
466+
rows_removed: int,
467+
rows_updated: int,
468+
rows_unchanged: int,
469+
extra_info_dict: Dict,
470+
extra_info_str: str,
471+
is_cloud: Optional[bool] = False,
472+
deps_impacts: Optional[Dict] = None,
463473
) -> str:
464-
string_output = f"\n{tabulate([[rows_added, rows_removed]], headers=['Rows Added', 'Rows Removed'])}"
474+
# main table
475+
main_rows = [
476+
["Total", total_rows_table1, "", f"{total_rows_table2} [{diff_int_dynamic_color_template(total_rows_diff)}]"],
477+
["Added", "", diff_int_dynamic_color_template(rows_added), ""],
478+
["Removed", "", diff_int_dynamic_color_template(-rows_removed), ""],
479+
["Different", "", rows_updated, ""],
480+
["Unchanged", "", rows_unchanged, ""],
481+
]
482+
483+
main_headers = ["rows", "PROD", "<>", "DEV"]
484+
main_table = tabulate(main_rows, headers=main_headers)
485+
486+
# diffs table
487+
diffs_rows = sorted(list(extra_info_dict.items()))
488+
489+
diffs_headers = ["columns", "% diff values" if is_cloud else "# diff values"]
490+
diffs_table = tabulate(diffs_rows, headers=diffs_headers)
491+
492+
# deps impacts table
493+
deps_impacts_table = ""
494+
if deps_impacts:
495+
deps_impacts_rows = list(deps_impacts.items())
496+
deps_impacts_headers = ["deps", "# data assets"]
497+
deps_impacts_table = f"\n\n{tabulate(deps_impacts_rows, headers=deps_impacts_headers)}"
498+
499+
# combine all tables
500+
string_output = f"\n{main_table}\n\n{diffs_table}{deps_impacts_table}"
465501

466-
string_output += f"\n\nUpdated Rows: {rows_updated}\n"
467-
string_output += f"Unchanged Rows: {rows_unchanged}\n\n"
502+
return string_output
468503

469-
string_output += extra_info_str
470504

471-
for k, v in extra_info_dict.items():
472-
string_output += f"\n{k}: {v}"
505+
def diff_int_dynamic_color_template(diff_value: int) -> str:
506+
if not isinstance(diff_value, int):
507+
return diff_value
473508

474-
return string_output
509+
if diff_value > 0:
510+
return f"[green]+{diff_value}[/]"
511+
elif diff_value < 0:
512+
return f"[red]{diff_value}[/]"
513+
else:
514+
return "0"
475515

476516

477517
def _jsons_equiv(a: str, b: str):
@@ -498,18 +538,18 @@ def diffs_are_equiv_jsons(diff: list, json_cols: dict):
498538
return match, overriden_diff_cols
499539

500540

501-
def columns_removed_template(columns_removed) -> str:
502-
columns_removed_str = f"Column(s) removed: {columns_removed}\n"
541+
def columns_removed_template(columns_removed: set) -> str:
542+
columns_removed_str = f"[red]Columns removed [-{len(columns_removed)}]:[/] [blue]{columns_removed}[/]\n"
503543
return columns_removed_str
504544

505545

506-
def columns_added_template(columns_added) -> str:
507-
columns_added_str = f"Column(s) added: {columns_added}\n"
546+
def columns_added_template(columns_added: set) -> str:
547+
columns_added_str = f"[green]Columns added [+{len(columns_added)}]: {columns_added}[/]\n"
508548
return columns_added_str
509549

510550

511551
def columns_type_changed_template(columns_type_changed) -> str:
512-
columns_type_changed_str = f"Type change: {columns_type_changed}\n"
552+
columns_type_changed_str = f"Type changed [{len(columns_type_changed)}]: [green]{columns_type_changed}[/]\n"
513553
return columns_type_changed_str
514554

515555

Diff for: data_diff/version.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.10.1"
1+
__version__ = "0.11.0"

Diff for: pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "data-diff"
3-
version = "0.10.1"
3+
version = "0.11.0"
44
description = "Command-line tool and Python library to efficiently diff rows across two different databases."
55
authors = ["Datafold <[email protected]>"]
66
license = "MIT"

Diff for: tests/dbt_artifacts/jaffle_shop.duckdb

0 Bytes
Binary file not shown.

Diff for: tests/dbt_artifacts/target/manifest.json

+1-1
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)