1
1
import os
2
+ import re
2
3
import time
3
4
import webbrowser
4
- from typing import List , Optional , Dict
5
+ from typing import List , Optional , Dict , Tuple , Union
5
6
import keyring
6
-
7
7
import pydantic
8
8
import rich
9
- from rich .prompt import Confirm
9
+ from rich .prompt import Confirm , Prompt
10
+
11
+ from data_diff .errors import DataDiffCustomSchemaNoConfigError , DataDiffDbtProjectVarsNotFoundError
10
12
11
13
from . import connect_to_table , diff_tables , Algorithm
12
14
from .cloud import DatafoldAPI , TCloudApiDataDiff , TCloudApiOrgMeta , get_or_create_data_source
13
- from .dbt_parser import DbtParser , PROJECT_FILE
15
+ from .dbt_parser import DbtParser , PROJECT_FILE , TDatadiffConfig
14
16
from .tracking import (
17
+ bool_ask_for_email ,
18
+ create_email_signup_event_json ,
15
19
set_entrypoint_name ,
16
20
set_dbt_user_id ,
17
21
set_dbt_version ,
@@ -52,24 +56,21 @@ def dbt_diff(
52
56
project_dir_override : Optional [str ] = None ,
53
57
is_cloud : bool = False ,
54
58
dbt_selection : Optional [str ] = None ,
59
+ state : Optional [str ] = None ,
55
60
) -> None :
56
61
print_version_info ()
57
62
diff_threads = []
58
63
set_entrypoint_name ("CLI-dbt" )
59
- dbt_parser = DbtParser (profiles_dir_override , project_dir_override )
64
+ dbt_parser = DbtParser (profiles_dir_override , project_dir_override , state )
60
65
models = dbt_parser .get_models (dbt_selection )
61
- datadiff_variables = dbt_parser .get_datadiff_variables ()
62
- config_prod_database = datadiff_variables .get ("prod_database" )
63
- config_prod_schema = datadiff_variables .get ("prod_schema" )
64
- config_prod_custom_schema = datadiff_variables .get ("prod_custom_schema" )
65
- datasource_id = datadiff_variables .get ("datasource_id" )
66
- set_dbt_user_id (dbt_parser .dbt_user_id )
67
- set_dbt_version (dbt_parser .dbt_version )
68
- set_dbt_project_id (dbt_parser .dbt_project_id )
69
-
70
- if datadiff_variables .get ("custom_schemas" ) is not None :
71
- logger .warning (
72
- "vars: data_diff: custom_schemas: is no longer used and can be removed.\n To utilize custom schemas, see the documentation here: https://docs.datafold.com/development_testing/open_source"
66
+ config = dbt_parser .get_datadiff_config ()
67
+ _initialize_events (dbt_parser .dbt_user_id , dbt_parser .dbt_version , dbt_parser .dbt_project_id )
68
+
69
+
70
+ if not state and not (config .prod_database or config .prod_schema ):
71
+ doc_url = "https://docs.datafold.com/development_testing/open_source#configure-your-dbt-project"
72
+ raise DataDiffDbtProjectVarsNotFoundError (
73
+ f"""vars: data_diff: section not found in dbt_project.yml.\n \n To solve this, please configure your dbt project: \n { doc_url } \n \n Or specify a production manifest using the `--state` flag."""
73
74
)
74
75
75
76
if is_cloud :
@@ -79,13 +80,13 @@ def dbt_diff(
79
80
return
80
81
org_meta = api .get_org_meta ()
81
82
82
- if datasource_id is None :
83
+ if config . datasource_id is None :
83
84
rich .print ("[red]Data source ID not found in dbt_project.yml" )
84
85
is_create_data_source = Confirm .ask ("Would you like to create a new data source?" )
85
86
if is_create_data_source :
86
- datasource_id = get_or_create_data_source (api = api , dbt_parser = dbt_parser )
87
+ config . datasource_id = get_or_create_data_source (api = api , dbt_parser = dbt_parser )
87
88
rich .print (f'To use the data source in next runs, please, update your "{ PROJECT_FILE } " with a block:' )
88
- rich .print (f"[green]vars:\n data_diff:\n datasource_id: { datasource_id } \n " )
89
+ rich .print (f"[green]vars:\n data_diff:\n datasource_id: { config . datasource_id } \n " )
89
90
rich .print (
90
91
"Read more about Datafold vars in docs: "
91
92
"https://docs.datafold.com/os_diff/dbt_integration/#configure-a-data-source\n "
@@ -96,21 +97,29 @@ def dbt_diff(
96
97
"\n vars:\n data_diff:\n datasource_id: 1234"
97
98
)
98
99
99
- data_source = api .get_data_source (datasource_id )
100
+ data_source = api .get_data_source (config . datasource_id )
100
101
dbt_parser .set_casing_policy_for (connection_type = data_source .type )
101
102
rich .print ("[green][bold]\n Diffs in progress...[/][/]\n " )
102
103
103
104
else :
104
105
dbt_parser .set_connection ()
105
106
106
107
for model in models :
107
- diff_vars = _get_diff_vars (
108
- dbt_parser , config_prod_database , config_prod_schema , config_prod_custom_schema , model
109
- )
108
+ diff_vars = _get_diff_vars (dbt_parser , config , model )
109
+
110
+ # we won't always have a prod path when using state
111
+ # when the model does not exist in the prod manifest, skip the model diff
112
+ if (
113
+ state and len (diff_vars .prod_path ) < 2
114
+ ): # < 2 because some providers like databricks can legitimately have *only* 2
115
+ diff_output_str = _diff_output_base ("." .join (diff_vars .dev_path ), "." .join (diff_vars .prod_path ))
116
+ diff_output_str += "[green]New model: nothing to diff![/] \n "
117
+ rich .print (diff_output_str )
118
+ continue
110
119
111
120
if diff_vars .primary_keys :
112
121
if is_cloud :
113
- diff_thread = run_as_daemon (_cloud_diff , diff_vars , datasource_id , api , org_meta )
122
+ diff_thread = run_as_daemon (_cloud_diff , diff_vars , config . datasource_id , api , org_meta )
114
123
diff_threads .append (diff_thread )
115
124
else :
116
125
_local_diff (diff_vars )
@@ -128,41 +137,19 @@ def dbt_diff(
128
137
129
138
def _get_diff_vars (
130
139
dbt_parser : "DbtParser" ,
131
- config_prod_database : Optional [str ],
132
- config_prod_schema : Optional [str ],
133
- config_prod_custom_schema : Optional [str ],
140
+ config : TDatadiffConfig ,
134
141
model ,
135
142
) -> TDiffVars :
136
143
dev_database = model .database
137
144
dev_schema = model .schema_
138
145
139
146
primary_keys = dbt_parser .get_pk_from_model (model , dbt_parser .unique_columns , "primary-key" )
140
147
141
- # "custom" dbt config database
142
- if model .config .database :
143
- prod_database = model .config .database
144
- elif config_prod_database :
145
- prod_database = config_prod_database
146
- else :
147
- prod_database = dev_database
148
-
149
- # prod schema name differs from dev schema name
150
- if config_prod_schema :
151
- custom_schema = model .config .schema_
152
-
153
- # the model has a custom schema config(schema='some_schema')
154
- if custom_schema :
155
- if not config_prod_custom_schema :
156
- raise ValueError (
157
- f"Found a custom schema on model { model .name } , but no value for\n vars:\n data_diff:\n prod_custom_schema:\n Please set a value!\n "
158
- + "For more details see: https://docs.datafold.com/development_testing/open_source"
159
- )
160
- prod_schema = config_prod_custom_schema .replace ("<custom_schema>" , custom_schema )
161
- # no custom schema, use the default
162
- else :
163
- prod_schema = config_prod_schema
148
+ # prod path is constructed via configuration or the prod manifest via --state
149
+ if dbt_parser .prod_manifest_obj :
150
+ prod_database , prod_schema = _get_prod_path_from_manifest (model , dbt_parser .prod_manifest_obj )
164
151
else :
165
- prod_schema = dev_schema
152
+ prod_database , prod_schema = _get_prod_path_from_config ( config , model , dev_database , dev_schema )
166
153
167
154
if dbt_parser .requires_upper :
168
155
dev_qualified_list = [x .upper () for x in [dev_database , dev_schema , model .alias ] if x ]
@@ -186,6 +173,45 @@ def _get_diff_vars(
186
173
)
187
174
188
175
176
def _get_prod_path_from_config(config, model, dev_database, dev_schema) -> Tuple[str, str]:
    """Resolve the production ``(database, schema)`` pair for a model from the data-diff config.

    Database precedence: the model's own ``config.database``, then
    ``config.prod_database``, then ``dev_database``.

    Schema resolution:
      * ``config.prod_schema`` unset -> ``dev_schema`` is reused.
      * ``config.prod_schema`` set, model has no custom schema -> ``config.prod_schema``.
      * ``config.prod_schema`` set, model has a custom schema ->
        ``config.prod_custom_schema`` with its ``<custom_schema>`` placeholder
        replaced by the model's custom schema.

    Raises:
        DataDiffCustomSchemaNoConfigError: the model declares a custom schema and
            ``config.prod_schema`` is set, but ``config.prod_custom_schema`` is not.
    """
    # The first truthy candidate wins; dev_database is the last resort.
    prod_database = model.config.database or config.prod_database or dev_database

    # Without a configured prod schema, prod and dev share the schema name.
    if not config.prod_schema:
        return prod_database, dev_schema

    model_custom_schema = model.config.schema_
    # No config(schema='some_schema') on the model: use the default prod schema.
    if not model_custom_schema:
        return prod_database, config.prod_schema

    if not config.prod_custom_schema:
        raise DataDiffCustomSchemaNoConfigError(
            f"Found a custom schema on model { model .name } , but no value for\n vars:\n data_diff:\n prod_custom_schema:\n Please set a value or utilize the `--state` flag!\n \n "
            + "For more details see: https://docs.datafold.com/development_testing/open_source"
        )

    resolved_schema = config.prod_custom_schema.replace("<custom_schema>", model_custom_schema)
    return prod_database, resolved_schema
203
+
204
+
205
def _get_prod_path_from_manifest(model, prod_manifest) -> Union[Tuple[str, str], Tuple[None, None]]:
    """Return the ``(database, schema)`` recorded for ``model`` in the production manifest.

    The model is looked up in ``prod_manifest.nodes`` by ``model.unique_id``.
    When no (truthy) node is found there, ``(None, None)`` is returned.
    """
    matched_node = prod_manifest.nodes.get(model.unique_id)
    if not matched_node:
        return None, None
    return matched_node.database, matched_node.schema_
213
+
214
+
189
215
def _local_diff (diff_vars : TDiffVars ) -> None :
190
216
dev_qualified_str = "." .join (diff_vars .dev_path )
191
217
prod_qualified_str = "." .join (diff_vars .prod_path )
@@ -389,3 +415,34 @@ def _cloud_diff(diff_vars: TDiffVars, datasource_id: int, api: DatafoldAPI, org_
389
415
390
416
def _diff_output_base(dev_path: str, prod_path: str) -> str:
    """Build the green rich-markup header for a diff: production path, ``<>``, then development path."""
    header = f"\n [green]{ prod_path } <> { dev_path } [/] \n "
    return header
418
+
419
+
420
def _initialize_events(dbt_user_id: Optional[str], dbt_version: Optional[str], dbt_project_id: Optional[str]) -> None:
    """Pass the dbt user id, version and project id to their tracking setters, then run the email sign-up step."""
    setter_value_pairs = (
        (set_dbt_user_id, dbt_user_id),
        (set_dbt_version, dbt_version),
        (set_dbt_project_id, dbt_project_id),
    )
    # Same call order as before: user id, version, project id.
    for setter, value in setter_value_pairs:
        setter(value)
    _email_signup()
425
+
426
+
427
def _email_signup() -> None:
    """Prompt for an optional notification email and submit it as a sign-up event.

    Nothing happens unless ``bool_ask_for_email()`` is truthy. Otherwise the
    user is prompted until the stripped input is either blank (opt out) or
    matches the email pattern; after an invalid entry an error line is printed
    and the prompt is repeated without the introductory text. A non-blank
    address is turned into an event via ``create_email_signup_event_json`` and
    handed to ``run_as_daemon(send_event_json, ...)``.
    """
    if not bool_ask_for_email():
        return

    email_regex = r'^[\w\.\+-]+@[\w\.-]+\.\w+$'
    intro_prompt = "\n Would you like to be notified when a new data-diff version is available?\n \n Enter email or leave blank to opt out (we'll only ask once).\n "

    email = Prompt.ask(prompt=intro_prompt, default="", show_default=False).strip()
    while email != "" and not re.match(email_regex, email):
        rich.print("[red]Invalid email. Please enter a valid email or leave it blank to opt out.[/]")
        # Retries use an empty prompt so the introductory text is not repeated.
        email = Prompt.ask(prompt="", default="", show_default=False).strip()

    if email:
        event_json = create_email_signup_event_json(email)
        run_as_daemon(send_event_json, event_json)
0 commit comments