Commit ad5466b

Widen parent start date override if it ends up being later than a child start date override

1 parent 29c0ed5 commit ad5466b

2 files changed: 174 additions & 0 deletions

sqlmesh/core/plan/builder.py

Lines changed: 37 additions & 0 deletions
@@ -282,6 +282,7 @@ def build(self) -> Plan:
         self._check_destructive_changes(directly_modified)
         self._categorize_snapshots(dag, indirectly_modified)
         self._adjust_new_snapshot_intervals()
+        self._adjust_start_overrides(dag)

         deployability_index = (
             DeployabilityIndex.create(
@@ -524,6 +525,42 @@ def _adjust_new_snapshot_intervals(self) -> None:
             if new.is_forward_only:
                 new.dev_intervals = new.intervals.copy()

+    def _adjust_start_overrides(self, dag: DAG[SnapshotId]) -> None:
+        if not self._start_override_per_model:
+            return
+
+        start_override_by_snapshot_id = {
+            self._context_diff.snapshots_by_name[name].snapshot_id: start_date
+            for name, start_date in self._start_override_per_model.items()
+        }
+
+        for current_snapshot_id in dag:
+            # we only care about adjusting the start date for incremental models
+            current_snapshot = self._context_diff.snapshots[current_snapshot_id]
+            if not current_snapshot.is_incremental:
+                continue
+
+            earliest_downstream_start_date_override = min(
+                (
+                    start_override_by_snapshot_id[downstream_sid]
+                    for downstream_sid in dag.downstream(current_snapshot_id)
+                    if downstream_sid in start_override_by_snapshot_id
+                ),
+                default=None,
+            )
+
+            if earliest_downstream_start_date_override:
+                current_start_date_override = start_override_by_snapshot_id.get(current_snapshot_id)
+
+                # if any of our downstream snapshots have a start date override earlier than us, we need to widen ourselves to include it
+                # otherwise, the downstream snapshots will only get a subset of the data they need
+                if not current_start_date_override or (
+                    earliest_downstream_start_date_override < current_start_date_override
+                ):
+                    self._start_override_per_model[current_snapshot.name] = (
+                        earliest_downstream_start_date_override
+                    )
+
     def _check_destructive_changes(self, directly_modified: t.Set[SnapshotId]) -> None:
         for s_id in sorted(directly_modified):
             snapshot = self._context_diff.snapshots[s_id]

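For orientation, here is a minimal standalone sketch of the widening rule that _adjust_start_overrides applies. The dict-based DAG and the children, downstream, and start_overrides names are hypothetical stand-ins for SQLMesh's DAG and snapshot types, not the real API:

from datetime import datetime

# parent -> direct children (i.e. models that select from the parent);
# mirrors the test scenario A <- B <- C plus an unrelated D
children = {"A": ["B"], "B": ["C"], "C": [], "D": []}

def downstream(node: str) -> list[str]:
    # all transitive descendants of `node`
    out: list[str] = []
    stack = list(children[node])
    while stack:
        cur = stack.pop()
        out.append(cur)
        stack.extend(children[cur])
    return out

# per-model start date overrides before adjustment; C reaches furthest
# back, so its parents must be widened to cover it
start_overrides = {
    "B": datetime(2020, 1, 30),
    "C": datetime(2020, 1, 19),
}

for node in ["A", "B", "C", "D"]:  # order is irrelevant: each node sees all descendants
    earliest = min(
        (start_overrides[d] for d in downstream(node) if d in start_overrides),
        default=None,
    )
    # widen the parent whenever a child needs data from earlier than the
    # parent currently provides
    if earliest and (node not in start_overrides or earliest < start_overrides[node]):
        start_overrides[node] = earliest

print(start_overrides)  # A and B are widened to 2020-01-19; D is untouched

The real implementation additionally skips non-incremental snapshots, since only incremental models are backfilled interval by interval and could otherwise hand their children a subset of the data they need.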
tests/core/test_context.py

Lines changed: 137 additions & 0 deletions
@@ -2477,3 +2477,140 @@ def _get_missing_intervals(plan: Plan, name: str) -> t.List[t.Tuple[datetime, da
     ) == [
         (to_datetime("2020-01-18 00:00:00"), to_datetime("2020-01-18 23:59:59.999999")),
     ]
+
+
+def test_plan_min_intervals_adjusted_for_downstream(tmp_path: Path):
+    """
+    Scenario:
+        A(hourly) <- B(daily) <- C(weekly)
+        D(hourly)
+
+    Each model's backfill needs to cover at least :min_intervals of every downstream model for the plan to be valid.
+    In this scenario, if min_intervals=1:
+        - A would need to cover at least 168 hours (7 days * 24 hours) because its downstream model C is weekly
+        - B would need to cover at least 7 days because its downstream model C is weekly
+        - C would need to cover at least 1 week because min_intervals: 1
+        - D is unrelated to A, B and C so would only need to cover 1 hour to satisfy min_intervals: 1
+    """
+
+    init_example_project(tmp_path, engine_type="duckdb", dialect="duckdb")
+
+    context = Context(
+        paths=tmp_path, config=Config(model_defaults=ModelDefaultsConfig(dialect="duckdb"))
+    )
+
+    current_time = to_datetime("2020-02-01 00:00:01")
+
+    # initial state of example project
+    context.plan(auto_apply=True, execution_time=current_time)
+
+    (tmp_path / "models" / "hourly_model.sql").write_text("""
+    MODEL (
+        name sqlmesh_example.hourly_model,
+        kind INCREMENTAL_BY_TIME_RANGE (
+            time_column start_dt,
+            batch_size 1
+        ),
+        start '2020-01-01',
+        cron '@hourly'
+    );
+
+    select @start_dt as start_dt, @end_dt as end_dt;
+    """)
+
+    (tmp_path / "models" / "unrelated_hourly_model.sql").write_text("""
+    MODEL (
+        name sqlmesh_example.unrelated_hourly_model,
+        kind INCREMENTAL_BY_TIME_RANGE (
+            time_column start_dt
+        ),
+        start '2020-01-01',
+        cron '@hourly'
+    );
+
+    select @start_dt as start_dt, @end_dt as end_dt;
+    """)
+
+    (tmp_path / "models" / "daily_model.sql").write_text("""
+    MODEL (
+        name sqlmesh_example.daily_model,
+        kind INCREMENTAL_BY_TIME_RANGE (
+            time_column start_dt
+        ),
+        start '2020-01-01',
+        cron '@daily'
+    );
+
+    select start_dt, end_dt from sqlmesh_example.hourly_model where start_dt between @start_dt and @end_dt;
+    """)
+
+    (tmp_path / "models" / "weekly_model.sql").write_text("""
+    MODEL (
+        name sqlmesh_example.weekly_model,
+        kind INCREMENTAL_BY_TIME_RANGE (
+            time_column start_dt
+        ),
+        start '2020-01-01',
+        cron '@weekly'
+    );
+
+    select start_dt, end_dt from sqlmesh_example.daily_model where start_dt between @start_dt and @end_dt;
+    """)
+
+    context.load()
+
+    # create a dev env for "1 day ago" with min_intervals=1
+    # this should force a week's worth of intervals for every model
+    plan = context.plan(
+        environment="pr_env",
+        start="1 day ago",
+        execution_time=current_time,
+        min_intervals=1,
+    )
+
+    def _get_missing_intervals(name: str) -> t.List[t.Tuple[datetime, datetime]]:
+        snapshot_id = context.get_snapshot(name, raise_if_missing=True).snapshot_id
+        snapshot_intervals = next(
+            si for si in plan.missing_intervals if si.snapshot_id == snapshot_id
+        )
+        return [(to_datetime(s), to_datetime(e)) for s, e in snapshot_intervals.merged_intervals]
+
+    # We only operate on completed intervals, so given the current_time this is the range of the last completed week
+    assert _get_missing_intervals("sqlmesh_example.weekly_model") == [
+        (to_datetime("2020-01-19 00:00:00"), to_datetime("2020-01-26 00:00:00"))
+    ]
+
+    # The daily model needs to cover the week, so it gets its start date moved back to line up
+    assert _get_missing_intervals("sqlmesh_example.daily_model") == [
+        (to_datetime("2020-01-19 00:00:00"), to_datetime("2020-02-01 00:00:00"))
+    ]
+
+    # The hourly model needs to cover both the daily model and the weekly model, so it also gets its start date moved back to line up with the weekly model
+    assert _get_missing_intervals("sqlmesh_example.hourly_model") == [
+        (to_datetime("2020-01-19 00:00:00"), to_datetime("2020-02-01 00:00:00"))
+    ]
+
+    # The unrelated model has no downstream constraints, so its start date doesn't get moved to line up with the weekly model
+    # However it still gets backfilled for 24 hours because the plan start is "1 day ago" and this satisfies min_intervals: 1
+    assert _get_missing_intervals("sqlmesh_example.unrelated_hourly_model") == [
+        (to_datetime("2020-01-31 00:00:00"), to_datetime("2020-02-01 00:00:00"))
+    ]
+
+    # Check that actually running the plan produces the correct result, since missing intervals are re-calculated in the evaluator
+    context.apply(plan)
+
+    assert context.engine_adapter.fetchall(
+        "select min(start_dt), max(end_dt) from sqlmesh_example__pr_env.weekly_model"
+    ) == [(to_datetime("2020-01-19 00:00:00"), to_datetime("2020-01-25 23:59:59.999999"))]
+
+    assert context.engine_adapter.fetchall(
+        "select min(start_dt), max(end_dt) from sqlmesh_example__pr_env.daily_model"
+    ) == [(to_datetime("2020-01-19 00:00:00"), to_datetime("2020-01-31 23:59:59.999999"))]
+
+    assert context.engine_adapter.fetchall(
+        "select min(start_dt), max(end_dt) from sqlmesh_example__pr_env.hourly_model"
+    ) == [(to_datetime("2020-01-19 00:00:00"), to_datetime("2020-01-31 23:59:59.999999"))]
+
+    assert context.engine_adapter.fetchall(
+        "select min(start_dt), max(end_dt) from sqlmesh_example__pr_env.unrelated_hourly_model"
+    ) == [(to_datetime("2020-01-31 00:00:00"), to_datetime("2020-01-31 23:59:59.999999"))]

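As a back-of-the-envelope check of the interval arithmetic these assertions rely on, the following sketch recomputes the expected ranges. The dates come from the test itself; the variable names are ours and not part of the test or of SQLMesh:

from datetime import datetime, timedelta

execution_time = datetime(2020, 2, 1, 0, 0, 1)

# only completed intervals are operated on, so the weekly model's single
# required interval is the last fully elapsed week before execution_time
weekly_start, weekly_end = datetime(2020, 1, 19), datetime(2020, 1, 26)
assert weekly_end - weekly_start == timedelta(weeks=1)  # satisfies min_intervals=1

# the daily and hourly parents are widened back to the weekly start but
# still run through their own last completed interval (midnight Feb 1)
parent_start, parent_end = weekly_start, datetime(2020, 2, 1)
assert parent_end - parent_start >= timedelta(hours=7 * 24)  # covers the 168 hours

# the unrelated hourly model keeps the plan start of "1 day ago", which
# already satisfies min_intervals=1 for an hourly cron
unrelated_start = datetime(2020, 1, 31)
assert parent_end - unrelated_start == timedelta(hours=24)

The fetchall assertions reflect the same ranges from the engine's point of view, with max(end_dt) being the inclusive end of the last backfilled interval rather than the exclusive boundary.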