11import json
22import os
3+ import re
34import time
45import webbrowser
5- from typing import List , Optional , Dict
6+ from typing import List , Optional , Dict , Tuple , Union
67import keyring
7-
88import pydantic
99import rich
10- from rich .prompt import Confirm
10+ from rich .prompt import Confirm , Prompt
11+
12+ from data_diff .errors import DataDiffCustomSchemaNoConfigError , DataDiffDbtProjectVarsNotFoundError
1113
1214from . import connect_to_table , diff_tables , Algorithm
1315from .cloud import DatafoldAPI , TCloudApiDataDiff , TCloudApiOrgMeta , get_or_create_data_source
14- from .dbt_parser import DbtParser , PROJECT_FILE
16+ from .dbt_parser import DbtParser , PROJECT_FILE , TDatadiffConfig
1517from .diff_tables import DiffResultWrapper
1618from .format import jsonify
1719from .tracking import (
20+ bool_ask_for_email ,
21+ create_email_signup_event_json ,
1822 set_entrypoint_name ,
1923 set_dbt_user_id ,
2024 set_dbt_version ,
@@ -56,24 +60,21 @@ def dbt_diff(
5660 is_cloud : bool = False ,
5761 dbt_selection : Optional [str ] = None ,
5862 json_output : bool = False ,
63+ state : Optional [str ] = None ,
5964) -> None :
6065 print_version_info ()
6166 diff_threads = []
6267 set_entrypoint_name ("CLI-dbt" )
63- dbt_parser = DbtParser (profiles_dir_override , project_dir_override )
68+ dbt_parser = DbtParser (profiles_dir_override , project_dir_override , state )
6469 models = dbt_parser .get_models (dbt_selection )
65- datadiff_variables = dbt_parser .get_datadiff_variables ()
66- config_prod_database = datadiff_variables .get ("prod_database" )
67- config_prod_schema = datadiff_variables .get ("prod_schema" )
68- config_prod_custom_schema = datadiff_variables .get ("prod_custom_schema" )
69- datasource_id = datadiff_variables .get ("datasource_id" )
70- set_dbt_user_id (dbt_parser .dbt_user_id )
71- set_dbt_version (dbt_parser .dbt_version )
72- set_dbt_project_id (dbt_parser .dbt_project_id )
73-
74- if datadiff_variables .get ("custom_schemas" ) is not None :
75- logger .warning (
76- "vars: data_diff: custom_schemas: is no longer used and can be removed.\n To utilize custom schemas, see the documentation here: https://docs.datafold.com/development_testing/open_source"
70+ config = dbt_parser .get_datadiff_config ()
71+ _initialize_events (dbt_parser .dbt_user_id , dbt_parser .dbt_version , dbt_parser .dbt_project_id )
72+
73+
74+ if not state and not (config .prod_database or config .prod_schema ):
75+ doc_url = "https://docs.datafold.com/development_testing/open_source#configure-your-dbt-project"
76+ raise DataDiffDbtProjectVarsNotFoundError (
77+ f"""vars: data_diff: section not found in dbt_project.yml.\n \n To solve this, please configure your dbt project: \n { doc_url } \n \n Or specify a production manifest using the `--state` flag."""
7778 )
7879
7980 if is_cloud :
@@ -83,13 +84,13 @@ def dbt_diff(
8384 return
8485 org_meta = api .get_org_meta ()
8586
86- if datasource_id is None :
87+ if config . datasource_id is None :
8788 rich .print ("[red]Data source ID not found in dbt_project.yml" )
8889 is_create_data_source = Confirm .ask ("Would you like to create a new data source?" )
8990 if is_create_data_source :
90- datasource_id = get_or_create_data_source (api = api , dbt_parser = dbt_parser )
91+ config . datasource_id = get_or_create_data_source (api = api , dbt_parser = dbt_parser )
9192 rich .print (f'To use the data source in next runs, please, update your "{ PROJECT_FILE } " with a block:' )
92- rich .print (f"[green]vars:\n data_diff:\n datasource_id: { datasource_id } \n " )
93+ rich .print (f"[green]vars:\n data_diff:\n datasource_id: { config . datasource_id } \n " )
9394 rich .print (
9495 "Read more about Datafold vars in docs: "
9596 "https://docs.datafold.com/os_diff/dbt_integration/#configure-a-data-source\n "
@@ -100,21 +101,29 @@ def dbt_diff(
100101 "\n vars:\n data_diff:\n datasource_id: 1234"
101102 )
102103
103- data_source = api .get_data_source (datasource_id )
104+ data_source = api .get_data_source (config . datasource_id )
104105 dbt_parser .set_casing_policy_for (connection_type = data_source .type )
105106 rich .print ("[green][bold]\n Diffs in progress...[/][/]\n " )
106107
107108 else :
108109 dbt_parser .set_connection ()
109110
110111 for model in models :
111- diff_vars = _get_diff_vars (
112- dbt_parser , config_prod_database , config_prod_schema , config_prod_custom_schema , model
113- )
112+ diff_vars = _get_diff_vars (dbt_parser , config , model )
113+
114+ # we won't always have a prod path when using state
115+ # when the model DNE in prod manifest, skip the model diff
116+ if (
117+ state and len (diff_vars .prod_path ) < 2
118+ ): # < 2 because some providers like databricks can legitimately have *only* 2
119+ diff_output_str = _diff_output_base ("." .join (diff_vars .dev_path ), "." .join (diff_vars .prod_path ))
120+ diff_output_str += "[green]New model: nothing to diff![/] \n "
121+ rich .print (diff_output_str )
122+ continue
114123
115124 if diff_vars .primary_keys :
116125 if is_cloud :
117- diff_thread = run_as_daemon (_cloud_diff , diff_vars , datasource_id , api , org_meta )
126+ diff_thread = run_as_daemon (_cloud_diff , diff_vars , config . datasource_id , api , org_meta )
118127 diff_threads .append (diff_thread )
119128 else :
120129 _local_diff (diff_vars , json_output )
@@ -132,41 +141,19 @@ def dbt_diff(
132141
133142def _get_diff_vars (
134143 dbt_parser : "DbtParser" ,
135- config_prod_database : Optional [str ],
136- config_prod_schema : Optional [str ],
137- config_prod_custom_schema : Optional [str ],
144+ config : TDatadiffConfig ,
138145 model ,
139146) -> TDiffVars :
140147 dev_database = model .database
141148 dev_schema = model .schema_
142149
143150 primary_keys = dbt_parser .get_pk_from_model (model , dbt_parser .unique_columns , "primary-key" )
144151
145- # "custom" dbt config database
146- if model .config .database :
147- prod_database = model .config .database
148- elif config_prod_database :
149- prod_database = config_prod_database
150- else :
151- prod_database = dev_database
152-
153- # prod schema name differs from dev schema name
154- if config_prod_schema :
155- custom_schema = model .config .schema_
156-
157- # the model has a custom schema config(schema='some_schema')
158- if custom_schema :
159- if not config_prod_custom_schema :
160- raise ValueError (
161- f"Found a custom schema on model { model .name } , but no value for\n vars:\n data_diff:\n prod_custom_schema:\n Please set a value!\n "
162- + "For more details see: https://docs.datafold.com/development_testing/open_source"
163- )
164- prod_schema = config_prod_custom_schema .replace ("<custom_schema>" , custom_schema )
165- # no custom schema, use the default
166- else :
167- prod_schema = config_prod_schema
152+ # prod path is constructed via configuration or the prod manifest via --state
153+ if dbt_parser .prod_manifest_obj :
154+ prod_database , prod_schema = _get_prod_path_from_manifest (model , dbt_parser .prod_manifest_obj )
168155 else :
169- prod_schema = dev_schema
156+ prod_database , prod_schema = _get_prod_path_from_config ( config , model , dev_database , dev_schema )
170157
171158 if dbt_parser .requires_upper :
172159 dev_qualified_list = [x .upper () for x in [dev_database , dev_schema , model .alias ] if x ]
@@ -190,6 +177,46 @@ def _get_diff_vars(
190177 )
191178
192179
180+
181+ def _get_prod_path_from_config (config , model , dev_database , dev_schema ) -> Tuple [str , str ]:
182+ # "custom" dbt config database
183+ if model .config .database :
184+ prod_database = model .config .database
185+ elif config .prod_database :
186+ prod_database = config .prod_database
187+ else :
188+ prod_database = dev_database
189+
190+ # prod schema name differs from dev schema name
191+ if config .prod_schema :
192+ custom_schema = model .config .schema_
193+
194+ # the model has a custom schema config(schema='some_schema')
195+ if custom_schema :
196+ if not config .prod_custom_schema :
197+ raise DataDiffCustomSchemaNoConfigError (
198+ f"Found a custom schema on model { model .name } , but no value for\n vars:\n data_diff:\n prod_custom_schema:\n Please set a value or utilize the `--state` flag!\n \n "
199+ + "For more details see: https://docs.datafold.com/development_testing/open_source"
200+ )
201+ prod_schema = config .prod_custom_schema .replace ("<custom_schema>" , custom_schema )
202+ # no custom schema, use the default
203+ else :
204+ prod_schema = config .prod_schema
205+ else :
206+ prod_schema = dev_schema
207+ return prod_database , prod_schema
208+
209+
210+ def _get_prod_path_from_manifest (model , prod_manifest ) -> Union [Tuple [str , str ], Tuple [None , None ]]:
211+ prod_database = None
212+ prod_schema = None
213+ prod_model = prod_manifest .nodes .get (model .unique_id , None )
214+ if prod_model :
215+ prod_database = prod_model .database
216+ prod_schema = prod_model .schema_
217+ return prod_database , prod_schema
218+
219+
193220def _local_diff (diff_vars : TDiffVars , json_output : bool = False ) -> None :
194221 dev_qualified_str = "." .join (diff_vars .dev_path )
195222 prod_qualified_str = "." .join (diff_vars .prod_path )
@@ -403,3 +430,34 @@ def _cloud_diff(diff_vars: TDiffVars, datasource_id: int, api: DatafoldAPI, org_
403430
404431def _diff_output_base (dev_path : str , prod_path : str ) -> str :
405432 return f"\n [green]{ prod_path } <> { dev_path } [/] \n "
433+
434+
435+ def _initialize_events (dbt_user_id : Optional [str ], dbt_version : Optional [str ], dbt_project_id : Optional [str ]) -> None :
436+ set_dbt_user_id (dbt_user_id )
437+ set_dbt_version (dbt_version )
438+ set_dbt_project_id (dbt_project_id )
439+ _email_signup ()
440+
441+
442+ def _email_signup () -> None :
443+ email_regex = r'^[\w\.\+-]+@[\w\.-]+\.\w+$'
444+ prompt = "\n Would you like to be notified when a new data-diff version is available?\n \n Enter email or leave blank to opt out (we'll only ask once).\n "
445+
446+ if bool_ask_for_email ():
447+ while True :
448+ email_input = Prompt .ask (
449+ prompt = prompt ,
450+ default = "" ,
451+ show_default = False ,
452+ )
453+ email = email_input .strip ()
454+
455+ if email == "" or re .match (email_regex , email ):
456+ break
457+
458+ prompt = ""
459+ rich .print ("[red]Invalid email. Please enter a valid email or leave it blank to opt out.[/]" )
460+
461+ if email :
462+ event_json = create_email_signup_event_json (email )
463+ run_as_daemon (send_event_json , event_json )
0 commit comments