Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit c3ae9b2

Browse files
committed
Merge branch 'master' into json-output
2 parents 7fbada3 + 2f541ec commit c3ae9b2

13 files changed

Lines changed: 555 additions & 282 deletions

File tree

data_diff/__main__.py

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -234,7 +234,14 @@ def write_usage(self, prog: str, args: str = "", prefix: Optional[str] = None) -
234234
"-s",
235235
default=None,
236236
metavar="PATH",
237-
help="select dbt resources to compare using dbt selection syntax",
237+
help="select dbt resources to compare using dbt selection syntax.",
238+
)
239+
@click.option(
240+
"--state",
241+
"-s",
242+
default=None,
243+
metavar="PATH",
244+
help="Specify manifest to utilize for 'prod' comparison paths instead of using configuration.",
238245
)
239246
def main(conf, run, **kw):
240247
if kw["table2"] is None and kw["database2"]:
@@ -267,6 +274,9 @@ def main(conf, run, **kw):
267274
logging.basicConfig(level=logging.WARNING, format=LOG_FORMAT, datefmt=DATE_FORMAT)
268275

269276
try:
277+
state = kw.pop("state", None)
278+
if state:
279+
state = os.path.expanduser(state)
270280
profiles_dir_override = kw.pop("dbt_profiles_dir", None)
271281
if profiles_dir_override:
272282
profiles_dir_override = os.path.expanduser(profiles_dir_override)
@@ -279,12 +289,13 @@ def main(conf, run, **kw):
279289
project_dir_override=project_dir_override,
280290
is_cloud=kw["cloud"],
281291
dbt_selection=kw["select"],
282-
json_output=kw["json_output"]
292+
json_output=kw["json_output"],
293+
state=state,
283294
)
284295
else:
285-
return _data_diff(dbt_project_dir=project_dir_override,
286-
dbt_profiles_dir=profiles_dir_override,
287-
**kw)
296+
return _data_diff(
297+
dbt_project_dir=project_dir_override, dbt_profiles_dir=profiles_dir_override, state=state, **kw
298+
)
288299
except Exception as e:
289300
logging.error(e)
290301
if kw["debug"]:
@@ -325,6 +336,7 @@ def _data_diff(
325336
dbt_profiles_dir,
326337
dbt_project_dir,
327338
select,
339+
state,
328340
threads1=None,
329341
threads2=None,
330342
__conf__=None,

data_diff/cloud/data_source.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,9 +52,9 @@ def _validate_temp_schema(temp_schema: str):
5252

5353

5454
def _get_temp_schema(dbt_parser: DbtParser, db_type: str) -> Optional[str]:
55-
diff_vars = dbt_parser.get_datadiff_variables()
56-
config_prod_database = diff_vars.get("prod_database")
57-
config_prod_schema = diff_vars.get("prod_schema")
55+
config = dbt_parser.get_datadiff_config()
56+
config_prod_database = config.prod_database
57+
config_prod_schema = config.prod_schema
5858
if config_prod_database is not None and config_prod_schema is not None:
5959
temp_schema = f"{config_prod_database}.{config_prod_schema}"
6060
if db_type == "snowflake":

data_diff/dbt.py

Lines changed: 110 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,24 @@
11
import json
22
import os
3+
import re
34
import time
45
import webbrowser
5-
from typing import List, Optional, Dict
6+
from typing import List, Optional, Dict, Tuple, Union
67
import keyring
7-
88
import pydantic
99
import rich
10-
from rich.prompt import Confirm
10+
from rich.prompt import Confirm, Prompt
11+
12+
from data_diff.errors import DataDiffCustomSchemaNoConfigError, DataDiffDbtProjectVarsNotFoundError
1113

1214
from . import connect_to_table, diff_tables, Algorithm
1315
from .cloud import DatafoldAPI, TCloudApiDataDiff, TCloudApiOrgMeta, get_or_create_data_source
14-
from .dbt_parser import DbtParser, PROJECT_FILE
16+
from .dbt_parser import DbtParser, PROJECT_FILE, TDatadiffConfig
1517
from .diff_tables import DiffResultWrapper
1618
from .format import jsonify
1719
from .tracking import (
20+
bool_ask_for_email,
21+
create_email_signup_event_json,
1822
set_entrypoint_name,
1923
set_dbt_user_id,
2024
set_dbt_version,
@@ -56,24 +60,21 @@ def dbt_diff(
5660
is_cloud: bool = False,
5761
dbt_selection: Optional[str] = None,
5862
json_output: bool = False,
63+
state: Optional[str] = None,
5964
) -> None:
6065
print_version_info()
6166
diff_threads = []
6267
set_entrypoint_name("CLI-dbt")
63-
dbt_parser = DbtParser(profiles_dir_override, project_dir_override)
68+
dbt_parser = DbtParser(profiles_dir_override, project_dir_override, state)
6469
models = dbt_parser.get_models(dbt_selection)
65-
datadiff_variables = dbt_parser.get_datadiff_variables()
66-
config_prod_database = datadiff_variables.get("prod_database")
67-
config_prod_schema = datadiff_variables.get("prod_schema")
68-
config_prod_custom_schema = datadiff_variables.get("prod_custom_schema")
69-
datasource_id = datadiff_variables.get("datasource_id")
70-
set_dbt_user_id(dbt_parser.dbt_user_id)
71-
set_dbt_version(dbt_parser.dbt_version)
72-
set_dbt_project_id(dbt_parser.dbt_project_id)
73-
74-
if datadiff_variables.get("custom_schemas") is not None:
75-
logger.warning(
76-
"vars: data_diff: custom_schemas: is no longer used and can be removed.\nTo utilize custom schemas, see the documentation here: https://docs.datafold.com/development_testing/open_source"
70+
config = dbt_parser.get_datadiff_config()
71+
_initialize_events(dbt_parser.dbt_user_id, dbt_parser.dbt_version, dbt_parser.dbt_project_id)
72+
73+
74+
if not state and not (config.prod_database or config.prod_schema):
75+
doc_url = "https://docs.datafold.com/development_testing/open_source#configure-your-dbt-project"
76+
raise DataDiffDbtProjectVarsNotFoundError(
77+
f"""vars: data_diff: section not found in dbt_project.yml.\n\nTo solve this, please configure your dbt project: \n{doc_url}\n\nOr specify a production manifest using the `--state` flag."""
7778
)
7879

7980
if is_cloud:
@@ -83,13 +84,13 @@ def dbt_diff(
8384
return
8485
org_meta = api.get_org_meta()
8586

86-
if datasource_id is None:
87+
if config.datasource_id is None:
8788
rich.print("[red]Data source ID not found in dbt_project.yml")
8889
is_create_data_source = Confirm.ask("Would you like to create a new data source?")
8990
if is_create_data_source:
90-
datasource_id = get_or_create_data_source(api=api, dbt_parser=dbt_parser)
91+
config.datasource_id = get_or_create_data_source(api=api, dbt_parser=dbt_parser)
9192
rich.print(f'To use the data source in next runs, please, update your "{PROJECT_FILE}" with a block:')
92-
rich.print(f"[green]vars:\n data_diff:\n datasource_id: {datasource_id}\n")
93+
rich.print(f"[green]vars:\n data_diff:\n datasource_id: {config.datasource_id}\n")
9394
rich.print(
9495
"Read more about Datafold vars in docs: "
9596
"https://docs.datafold.com/os_diff/dbt_integration/#configure-a-data-source\n"
@@ -100,21 +101,29 @@ def dbt_diff(
100101
"\nvars:\n data_diff:\n datasource_id: 1234"
101102
)
102103

103-
data_source = api.get_data_source(datasource_id)
104+
data_source = api.get_data_source(config.datasource_id)
104105
dbt_parser.set_casing_policy_for(connection_type=data_source.type)
105106
rich.print("[green][bold]\nDiffs in progress...[/][/]\n")
106107

107108
else:
108109
dbt_parser.set_connection()
109110

110111
for model in models:
111-
diff_vars = _get_diff_vars(
112-
dbt_parser, config_prod_database, config_prod_schema, config_prod_custom_schema, model
113-
)
112+
diff_vars = _get_diff_vars(dbt_parser, config, model)
113+
114+
# we won't always have a prod path when using state
115+
# when the model DNE in prod manifest, skip the model diff
116+
if (
117+
state and len(diff_vars.prod_path) < 2
118+
): # < 2 because some providers like databricks can legitimately have *only* 2
119+
diff_output_str = _diff_output_base(".".join(diff_vars.dev_path), ".".join(diff_vars.prod_path))
120+
diff_output_str += "[green]New model: nothing to diff![/] \n"
121+
rich.print(diff_output_str)
122+
continue
114123

115124
if diff_vars.primary_keys:
116125
if is_cloud:
117-
diff_thread = run_as_daemon(_cloud_diff, diff_vars, datasource_id, api, org_meta)
126+
diff_thread = run_as_daemon(_cloud_diff, diff_vars, config.datasource_id, api, org_meta)
118127
diff_threads.append(diff_thread)
119128
else:
120129
_local_diff(diff_vars, json_output)
@@ -132,41 +141,19 @@ def dbt_diff(
132141

133142
def _get_diff_vars(
134143
dbt_parser: "DbtParser",
135-
config_prod_database: Optional[str],
136-
config_prod_schema: Optional[str],
137-
config_prod_custom_schema: Optional[str],
144+
config: TDatadiffConfig,
138145
model,
139146
) -> TDiffVars:
140147
dev_database = model.database
141148
dev_schema = model.schema_
142149

143150
primary_keys = dbt_parser.get_pk_from_model(model, dbt_parser.unique_columns, "primary-key")
144151

145-
# "custom" dbt config database
146-
if model.config.database:
147-
prod_database = model.config.database
148-
elif config_prod_database:
149-
prod_database = config_prod_database
150-
else:
151-
prod_database = dev_database
152-
153-
# prod schema name differs from dev schema name
154-
if config_prod_schema:
155-
custom_schema = model.config.schema_
156-
157-
# the model has a custom schema config(schema='some_schema')
158-
if custom_schema:
159-
if not config_prod_custom_schema:
160-
raise ValueError(
161-
f"Found a custom schema on model {model.name}, but no value for\nvars:\n data_diff:\n prod_custom_schema:\nPlease set a value!\n"
162-
+ "For more details see: https://docs.datafold.com/development_testing/open_source"
163-
)
164-
prod_schema = config_prod_custom_schema.replace("<custom_schema>", custom_schema)
165-
# no custom schema, use the default
166-
else:
167-
prod_schema = config_prod_schema
152+
# prod path is constructed via configuration or the prod manifest via --state
153+
if dbt_parser.prod_manifest_obj:
154+
prod_database, prod_schema = _get_prod_path_from_manifest(model, dbt_parser.prod_manifest_obj)
168155
else:
169-
prod_schema = dev_schema
156+
prod_database, prod_schema = _get_prod_path_from_config(config, model, dev_database, dev_schema)
170157

171158
if dbt_parser.requires_upper:
172159
dev_qualified_list = [x.upper() for x in [dev_database, dev_schema, model.alias] if x]
@@ -190,6 +177,46 @@ def _get_diff_vars(
190177
)
191178

192179

180+
181+
def _get_prod_path_from_config(config, model, dev_database, dev_schema) -> Tuple[str, str]:
182+
# "custom" dbt config database
183+
if model.config.database:
184+
prod_database = model.config.database
185+
elif config.prod_database:
186+
prod_database = config.prod_database
187+
else:
188+
prod_database = dev_database
189+
190+
# prod schema name differs from dev schema name
191+
if config.prod_schema:
192+
custom_schema = model.config.schema_
193+
194+
# the model has a custom schema config(schema='some_schema')
195+
if custom_schema:
196+
if not config.prod_custom_schema:
197+
raise DataDiffCustomSchemaNoConfigError(
198+
f"Found a custom schema on model {model.name}, but no value for\nvars:\n data_diff:\n prod_custom_schema:\nPlease set a value or utilize the `--state` flag!\n\n"
199+
+ "For more details see: https://docs.datafold.com/development_testing/open_source"
200+
)
201+
prod_schema = config.prod_custom_schema.replace("<custom_schema>", custom_schema)
202+
# no custom schema, use the default
203+
else:
204+
prod_schema = config.prod_schema
205+
else:
206+
prod_schema = dev_schema
207+
return prod_database, prod_schema
208+
209+
210+
def _get_prod_path_from_manifest(model, prod_manifest) -> Union[Tuple[str, str], Tuple[None, None]]:
211+
prod_database = None
212+
prod_schema = None
213+
prod_model = prod_manifest.nodes.get(model.unique_id, None)
214+
if prod_model:
215+
prod_database = prod_model.database
216+
prod_schema = prod_model.schema_
217+
return prod_database, prod_schema
218+
219+
193220
def _local_diff(diff_vars: TDiffVars, json_output: bool = False) -> None:
194221
dev_qualified_str = ".".join(diff_vars.dev_path)
195222
prod_qualified_str = ".".join(diff_vars.prod_path)
@@ -403,3 +430,34 @@ def _cloud_diff(diff_vars: TDiffVars, datasource_id: int, api: DatafoldAPI, org_
403430

404431
def _diff_output_base(dev_path: str, prod_path: str) -> str:
405432
return f"\n[green]{prod_path} <> {dev_path}[/] \n"
433+
434+
435+
def _initialize_events(dbt_user_id: Optional[str], dbt_version: Optional[str], dbt_project_id: Optional[str]) -> None:
436+
set_dbt_user_id(dbt_user_id)
437+
set_dbt_version(dbt_version)
438+
set_dbt_project_id(dbt_project_id)
439+
_email_signup()
440+
441+
442+
def _email_signup() -> None:
443+
email_regex = r'^[\w\.\+-]+@[\w\.-]+\.\w+$'
444+
prompt = "\nWould you like to be notified when a new data-diff version is available?\n\nEnter email or leave blank to opt out (we'll only ask once).\n"
445+
446+
if bool_ask_for_email():
447+
while True:
448+
email_input = Prompt.ask(
449+
prompt=prompt,
450+
default="",
451+
show_default=False,
452+
)
453+
email = email_input.strip()
454+
455+
if email == "" or re.match(email_regex, email):
456+
break
457+
458+
prompt = ""
459+
rich.print("[red]Invalid email. Please enter a valid email or leave it blank to opt out.[/]")
460+
461+
if email:
462+
event_json = create_email_signup_event_json(email)
463+
run_as_daemon(send_event_json, event_json)

0 commit comments

Comments
 (0)