Skip to content

Commit 8ccf84b

Browse files
Feat(dlt): Add support to override the dlt pipelines directory (#3984)
1 parent f4cb59b commit 8ccf84b

6 files changed

Lines changed: 88 additions & 12 deletions

File tree

docs/integrations/dlt.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,14 @@ This will create the configuration file and directories, which are found in all
2828

2929
SQLMesh will also automatically generate models to ingest data from the pipeline incrementally. Incremental loading is ideal for large datasets where recomputing entire tables is resource-intensive. In this case, SQLMesh utilizes the [`INCREMENTAL_BY_TIME_RANGE` model kind](../concepts/models/model_kinds.md#incremental_by_time_range). However, these model definitions can be customized to meet your specific project needs.
3030

31+
#### Specify the path to the pipelines directory
32+
33+
The default location for dlt pipelines is `~/.dlt/pipelines/<pipeline_name>`. If your pipelines are in a [different directory](https://dlthub.com/docs/general-usage/pipeline#separate-working-environments-with-pipelines_dir), use the `--dlt-path` argument to specify the path explicitly:
34+
35+
```bash
36+
$ sqlmesh init -t dlt --dlt-pipeline <pipeline-name> --dlt-path <pipelines-directory> <dialect>
37+
```
38+
3139
### Generating models on demand
3240

3341
To update the models in your SQLMesh project on demand, use the `dlt_refresh` command. This allows you to either specify individual tables to generate incremental models from or update all models at once.
@@ -50,6 +58,12 @@ $ sqlmesh dlt_refresh <pipeline-name> --force
5058
$ sqlmesh dlt_refresh <pipeline-name> --table <dlt-table>
5159
```
5260

61+
- **Provide the explicit path to the pipelines directory** (using `--dlt-path`):
62+
63+
```bash
64+
$ sqlmesh dlt_refresh <pipeline-name> --dlt-path <pipelines-directory>
65+
```
66+
5367
#### Configuration
5468

5569
SQLMesh will retrieve the data warehouse connection credentials from your dlt project to configure the `config.yaml` file. This configuration can be modified or customized as needed. For more details, refer to the [configuration guide](../guides/configuration.md).

sqlmesh/cli/example_project.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,7 @@ def init_example_project(
252252
dialect: t.Optional[str],
253253
template: ProjectTemplate = ProjectTemplate.DEFAULT,
254254
pipeline: t.Optional[str] = None,
255+
dlt_path: t.Optional[str] = None,
255256
schema_name: str = "sqlmesh_example",
256257
) -> None:
257258
root_path = Path(path)
@@ -276,7 +277,9 @@ def init_example_project(
276277
start = None
277278
if template == ProjectTemplate.DLT:
278279
if pipeline and dialect:
279-
models, settings, start = generate_dlt_models_and_settings(pipeline, dialect)
280+
models, settings, start = generate_dlt_models_and_settings(
281+
pipeline_name=pipeline, dialect=dialect, dlt_path=dlt_path
282+
)
280283
else:
281284
raise click.ClickException(
282285
"DLT pipeline is a required argument to generate a SQLMesh project from DLT"

sqlmesh/cli/main.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,11 @@ def cli(
135135
type=str,
136136
help="DLT pipeline for which to generate a SQLMesh project. Use alongside template: dlt",
137137
)
138+
@click.option(
139+
"--dlt-path",
140+
type=str,
141+
help="The directory where the DLT pipeline resides. Use alongside template: dlt",
142+
)
138143
@click.pass_context
139144
@error_handler
140145
@cli_analytics
@@ -143,14 +148,19 @@ def init(
143148
sql_dialect: t.Optional[str] = None,
144149
template: t.Optional[str] = None,
145150
dlt_pipeline: t.Optional[str] = None,
151+
dlt_path: t.Optional[str] = None,
146152
) -> None:
147153
"""Create a new SQLMesh repository."""
148154
try:
149155
project_template = ProjectTemplate(template.lower() if template else "default")
150156
except ValueError:
151157
raise click.ClickException(f"Invalid project template '{template}'")
152158
init_example_project(
153-
ctx.obj, dialect=sql_dialect, template=project_template, pipeline=dlt_pipeline
159+
ctx.obj,
160+
dialect=sql_dialect,
161+
template=project_template,
162+
pipeline=dlt_pipeline,
163+
dlt_path=dlt_path,
154164
)
155165

156166

@@ -955,6 +965,11 @@ def table_name(obj: Context, model_name: str, dev: bool) -> None:
955965
default=False,
956966
help="If set, existing models are overwritten with the new DLT tables.",
957967
)
968+
@click.option(
969+
"--dlt-path",
970+
type=str,
971+
help="The directory where the DLT pipeline resides.",
972+
)
958973
@click.pass_context
959974
@error_handler
960975
@cli_analytics
@@ -963,11 +978,12 @@ def dlt_refresh(
963978
pipeline: str,
964979
force: bool,
965980
table: t.List[str] = [],
981+
dlt_path: t.Optional[str] = None,
966982
) -> None:
967983
"""Attaches to a DLT pipeline with the option to update specific or all missing tables in the SQLMesh project."""
968984
from sqlmesh.integrations.dlt import generate_dlt_models
969985

970-
sqlmesh_models = generate_dlt_models(ctx.obj, pipeline, list(table or []), force)
986+
sqlmesh_models = generate_dlt_models(ctx.obj, pipeline, list(table or []), force, dlt_path)
971987
if sqlmesh_models:
972988
model_names = "\n".join([f"- {model_name}" for model_name in sqlmesh_models])
973989
ctx.obj.console.log_success(f"Updated SQLMesh project with models:\n{model_names}")

sqlmesh/integrations/dlt.py

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,18 +9,31 @@
99

1010

1111
def generate_dlt_models_and_settings(
12-
pipeline_name: str, dialect: str, tables: t.Optional[t.List[str]] = None
12+
pipeline_name: str,
13+
dialect: str,
14+
tables: t.Optional[t.List[str]] = None,
15+
dlt_path: t.Optional[str] = None,
1316
) -> t.Tuple[t.Set[t.Tuple[str, str]], str, str]:
14-
"""This function attaches to a DLT pipeline and retrieves the connection configs and
17+
"""
18+
This function attaches to a DLT pipeline and retrieves the connection configs and
1519
SQLMesh models based on the tables present in the pipeline's default schema.
20+
21+
Args:
22+
pipeline_name: The name of the DLT pipeline to attach to.
23+
dialect: The SQL dialect to use for generating SQLMesh models.
24+
tables: A list of table names to include.
25+
dlt_path: The path to the directory containing the DLT pipelines.
26+
27+
Returns:
28+
A tuple containing a set of the SQLMesh model definitions, the connection config and the start date.
1629
"""
1730

1831
import dlt
1932
from dlt.common.schema.utils import has_table_seen_data, is_complete_column
2033
from dlt.pipeline.exceptions import CannotRestorePipelineException
2134

2235
try:
23-
pipeline = dlt.attach(pipeline_name=pipeline_name)
36+
pipeline = dlt.attach(pipeline_name=pipeline_name, pipelines_dir=dlt_path or "")
2437
except CannotRestorePipelineException:
2538
raise click.ClickException(f"Could not attach to pipeline {pipeline_name}")
2639

@@ -108,14 +121,19 @@ def generate_dlt_models_and_settings(
108121

109122

110123
def generate_dlt_models(
111-
context: Context, pipeline_name: str, tables: t.List[str], force: bool
124+
context: Context,
125+
pipeline_name: str,
126+
tables: t.List[str],
127+
force: bool,
128+
dlt_path: t.Optional[str] = None,
112129
) -> t.List[str]:
113130
from sqlmesh.cli.example_project import _create_models
114131

115132
sqlmesh_models, _, _ = generate_dlt_models_and_settings(
116133
pipeline_name=pipeline_name,
117134
dialect=context.config.dialect or "",
118135
tables=tables if tables else None,
136+
dlt_path=dlt_path,
119137
)
120138

121139
if not tables and not force:

sqlmesh/magics.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,11 @@ def context(self, line: str) -> None:
165165
type=str,
166166
help="DLT pipeline for which to generate a SQLMesh project. Use alongside template: dlt",
167167
)
168+
@argument(
169+
"--dlt-path",
170+
type=str,
171+
help="The directory where the DLT pipeline resides. Use alongside template: dlt",
172+
)
168173
@line_magic
169174
def init(self, line: str) -> None:
170175
"""Creates a SQLMesh project scaffold with a default SQL dialect."""
@@ -175,7 +180,9 @@ def init(self, line: str) -> None:
175180
)
176181
except ValueError:
177182
raise MagicError(f"Invalid project template '{args.template}'")
178-
init_example_project(args.path, args.sql_dialect, project_template, args.dlt_pipeline)
183+
init_example_project(
184+
args.path, args.sql_dialect, project_template, args.dlt_pipeline, args.dlt_path
185+
)
179186
html = str(
180187
h(
181188
"div",
@@ -741,6 +748,11 @@ def table_name(self, context: Context, line: str) -> None:
741748
action="store_true",
742749
help="If set, existing models are overwritten with the new DLT tables.",
743750
)
751+
@argument(
752+
"--dlt-path",
753+
type=str,
754+
help="The directory where the DLT pipeline resides.",
755+
)
744756
@line_magic
745757
@pass_sqlmesh_context
746758
def dlt_refresh(self, context: Context, line: str) -> None:
@@ -749,7 +761,7 @@ def dlt_refresh(self, context: Context, line: str) -> None:
749761

750762
args = parse_argstring(self.dlt_refresh, line)
751763
sqlmesh_models = generate_dlt_models(
752-
context, args.pipeline, list(args.table or []), args.force
764+
context, args.pipeline, list(args.table or []), args.force, args.dlt_path
753765
)
754766
if sqlmesh_models:
755767
model_names = "\n".join([f"- {model_name}" for model_name in sqlmesh_models])

tests/cli/test_cli.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from contextlib import contextmanager
33
from os import getcwd, path, remove
44
from pathlib import Path
5+
from click import ClickException
56
import pytest
67
from click.testing import CliRunner
78
import time_machine
@@ -778,6 +779,8 @@ def test_dlt_pipeline_errors(runner, tmp_path):
778779

779780
@time_machine.travel(FREEZE_TIME)
780781
def test_plan_dlt(runner, tmp_path):
782+
from dlt.common.pipeline import get_dlt_pipelines_dir
783+
781784
root_dir = path.abspath(getcwd())
782785
pipeline_path = root_dir + "/examples/sushi_dlt/sushi_pipeline.py"
783786
dataset_path = root_dir + "/sushi.duckdb"
@@ -788,7 +791,15 @@ def test_plan_dlt(runner, tmp_path):
788791
with open(pipeline_path) as file:
789792
exec(file.read())
790793

791-
init_example_project(tmp_path, "duckdb", ProjectTemplate.DLT, "sushi")
794+
# This should fail since it won't be able to locate the pipeline in this path
795+
with pytest.raises(ClickException, match=r".*Could not attach to pipeline*"):
796+
init_example_project(
797+
tmp_path, "duckdb", ProjectTemplate.DLT, "sushi", dlt_path="./dlt2/pipelines"
798+
)
799+
800+
# By setting the pipelines path where the pipeline directory is located, it should work
801+
dlt_path = get_dlt_pipelines_dir()
802+
init_example_project(tmp_path, "duckdb", ProjectTemplate.DLT, "sushi", dlt_path=dlt_path)
792803

793804
expected_config = f"""gateways:
794805
duckdb:
@@ -925,8 +936,9 @@ def test_plan_dlt(runner, tmp_path):
925936
remove(dlt_sushi_fillings_model_path)
926937
remove(dlt_sushi_twice_nested_model_path)
927938

928-
# Update to generate a specific model: sushi_types
929-
assert generate_dlt_models(context, "sushi", ["sushi_types"], False) == [
939+
# Update to generate a specific model: sushi_types.
940+
# Also validate using the dlt_path that the pipelines are located.
941+
assert generate_dlt_models(context, "sushi", ["sushi_types"], False, dlt_path) == [
930942
"sushi_dataset_sqlmesh.incremental_sushi_types"
931943
]
932944

@@ -972,6 +984,7 @@ def test_init_project_dialects(tmp_path):
972984
remove(tmp_path / "config.yaml")
973985

974986

987+
@time_machine.travel(FREEZE_TIME)
975988
def test_environments(runner, tmp_path):
976989
create_example_project(tmp_path)
977990
ttl = time_like_to_str(to_datetime(now_ds()) + timedelta(days=7))

0 commit comments

Comments
 (0)