Skip to content

Commit d1c34ce

Browse files
authored
Chore: Refactor the state stream interface (#4125)
1 parent a0f7566 commit d1c34ce

4 files changed

Lines changed: 181 additions & 141 deletions

File tree

sqlmesh/core/console.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1965,12 +1965,16 @@ def print_environments(self, environments_summary: t.Dict[str, int]) -> None:
19651965
self.log_status_update(f"Number of SQLMesh environments are: {output_str}")
19661966

19671967
def print_connection_config(self, config: ConnectionConfig, title: str = "Connection") -> None:
1968-
engine_adapter_type = config._engine_adapter
1969-
19701968
tree = Tree(f"[b]{title}:[/b]")
19711969
tree.add(f"Type: [bold cyan]{config.type_}[/bold cyan]")
19721970
tree.add(f"Catalog: [bold cyan]{config.get_catalog()}[/bold cyan]")
1973-
tree.add(f"Dialect: [bold cyan]{engine_adapter_type.DIALECT}[/bold cyan]")
1971+
1972+
try:
1973+
engine_adapter_type = config._engine_adapter
1974+
tree.add(f"Dialect: [bold cyan]{engine_adapter_type.DIALECT}[/bold cyan]")
1975+
except NotImplementedError:
1976+
# not all ConnectionConfigs have an engine adapter associated. The CloudConnectionConfig has an HTTP client instead
1977+
pass
19741978

19751979
self._print(tree)
19761980

sqlmesh/core/state_sync/common.py

Lines changed: 52 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
import itertools
77
import abc
88

9+
from dataclasses import dataclass
10+
911
from sqlmesh.core.console import Console
1012
from sqlmesh.core.dialect import schema_
1113
from sqlmesh.utils.pydantic import PydanticModel
@@ -119,23 +121,62 @@ class EnvironmentWithStatements(PydanticModel):
119121
statements: t.List[EnvironmentStatements] = []
120122

121123

124+
@dataclass
125+
class VersionsChunk:
126+
versions: Versions
127+
128+
129+
class SnapshotsChunk:
130+
def __init__(self, items: t.Iterator[Snapshot]):
131+
self.items = items
132+
133+
def __iter__(self) -> t.Iterator[Snapshot]:
134+
return self.items
135+
136+
137+
class EnvironmentsChunk:
138+
def __init__(self, items: t.Iterator[EnvironmentWithStatements]):
139+
self.items = items
140+
141+
def __iter__(self) -> t.Iterator[EnvironmentWithStatements]:
142+
return self.items
143+
144+
145+
StateStreamContents = t.Union[VersionsChunk, SnapshotsChunk, EnvironmentsChunk]
146+
147+
122148
class StateStream(abc.ABC):
123149
"""
124150
Represents a stream of state either going into the StateSync (perhaps loaded from a file)
125151
or out of the StateSync (perhaps being dumped to a file)
152+
153+
Iterating over the stream produces the following chunks:
154+
155+
VersionsChunk: The versions of the objects contained in this StateStream
156+
SnapshotsChunk: Is itself an iterator that streams Snapshot objects. Note that they should be fully populated with any relevant Intervals
157+
EnvironmentsChunk: Is itself an iterator emitting a stream of Environments with any EnvironmentStatements attached
158+
159+
The idea here is to give some structure to the stream and ensure that callers have the opportunity to process all its components while not
160+
needing to worry about the order they are emitted in
126161
"""
127162

128-
@property
129163
@abc.abstractmethod
130-
def versions(self) -> Versions:
131-
"""The versions of the objects contained in this StateStream"""
164+
def __iter__(self) -> t.Iterator[StateStreamContents]:
165+
pass
132166

133-
@property
134-
@abc.abstractmethod
135-
def snapshots(self) -> t.Iterable[Snapshot]:
136-
"""A stream of Snapshot objects. Note that they should be fully populated with any relevant Intervals"""
167+
@classmethod
168+
def from_iterators(
169+
cls: t.Type["StateStream"],
170+
versions: Versions,
171+
snapshots: t.Iterator[Snapshot],
172+
environments: t.Iterator[EnvironmentWithStatements],
173+
) -> "StateStream":
174+
class _StateStream(cls): # type: ignore
175+
def __iter__(self) -> t.Iterator[StateStreamContents]:
176+
yield VersionsChunk(versions)
137177

138-
@property
139-
@abc.abstractmethod
140-
def environments(self) -> t.Iterable[EnvironmentWithStatements]:
141-
"""A stream of Environments with any EnvironmentStatements attached"""
178+
yield SnapshotsChunk(snapshots)
179+
180+
yield EnvironmentsChunk(environments)
181+
182+
return _StateStream()

sqlmesh/core/state_sync/db/facade.py

Lines changed: 72 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919
import contextlib
2020
import logging
2121
import typing as t
22-
import itertools
2322
from pathlib import Path
2423
from datetime import datetime
2524

@@ -48,6 +47,9 @@
4847
Versions,
4948
)
5049
from sqlmesh.core.state_sync.common import (
50+
EnvironmentsChunk,
51+
SnapshotsChunk,
52+
VersionsChunk,
5153
transactional,
5254
StateStream,
5355
chunk_iterable,
@@ -448,7 +450,9 @@ def rollback(self) -> None:
448450

449451
@transactional()
450452
def export(self, environment_names: t.Optional[t.List[str]] = None) -> StateStream:
451-
state_sync = self
453+
versions = self.get_versions(
454+
validate=True
455+
) # will throw if the state db hasn't been created or there is a version mismatch
452456

453457
snapshot_ids_to_export: t.Set[SnapshotId] = set()
454458
selected_environments: t.List[Environment] = []
@@ -458,89 +462,84 @@ def export(self, environment_names: t.Optional[t.List[str]] = None) -> StateStre
458462
if not environment:
459463
raise SQLMeshError(f"No such environment: {env_name}")
460464
selected_environments.append(environment)
465+
else:
466+
selected_environments = self.get_environments()
461467

462-
for env in selected_environments:
463-
snapshot_ids_to_export |= set([s.snapshot_id for s in env.snapshots or []])
464-
465-
def _include_snapshot(s_id: SnapshotId) -> bool:
466-
if environment_names:
467-
return s_id in snapshot_ids_to_export
468-
return True
469-
470-
class _DumpStateStream(StateStream):
471-
@property
472-
def versions(self) -> Versions:
473-
return state_sync.get_versions()
474-
475-
@property
476-
def snapshots(self) -> t.Iterable[Snapshot]:
477-
all_snapshot_ids = {
478-
s.snapshot_id
479-
for e in state_sync.get_environments()
480-
for s in e.snapshots
481-
if _include_snapshot(s.snapshot_id)
482-
}
483-
for chunk in chunk_iterable(all_snapshot_ids, SnapshotState.SNAPSHOT_BATCH_SIZE):
484-
yield from state_sync.get_snapshots(chunk).values()
468+
for env in selected_environments:
469+
snapshot_ids_to_export |= set([s.snapshot_id for s in env.snapshots or []])
485470

486-
@property
487-
def environments(self) -> t.Iterable[EnvironmentWithStatements]:
488-
envs = selected_environments if environment_names else state_sync.get_environments()
471+
def _export_snapshots() -> t.Iterator[Snapshot]:
472+
for chunk in chunk_iterable(snapshot_ids_to_export, SnapshotState.SNAPSHOT_BATCH_SIZE):
473+
yield from self.get_snapshots(chunk).values()
489474

490-
for env in envs:
491-
yield EnvironmentWithStatements(
492-
environment=env, statements=state_sync.get_environment_statements(env.name)
493-
)
475+
def _export_environments() -> t.Iterator[EnvironmentWithStatements]:
476+
for env in selected_environments:
477+
yield EnvironmentWithStatements(
478+
environment=env, statements=self.get_environment_statements(env.name)
479+
)
494480

495-
return _DumpStateStream()
481+
return StateStream.from_iterators(
482+
versions=versions,
483+
snapshots=_export_snapshots(),
484+
environments=_export_environments(),
485+
)
496486

497487
@transactional()
498488
def import_(self, stream: StateStream, clear: bool = True) -> None:
499489
existing_versions = self.get_versions()
500490

501-
# SQLMesh major/minor version must match so that we can be sure the JSON contained in the state file
502-
# is compatible with our Pydantic model definitions. Patch versions don't need to match because the assumption
503-
# is that they don't contain any breaking changes
504-
incoming_versions = stream.versions
505-
if incoming_versions.minor_sqlmesh_version != existing_versions.minor_sqlmesh_version:
506-
raise SQLMeshError(
507-
f"SQLMesh version mismatch. You are running '{existing_versions.sqlmesh_version}' but the state file was created with '{incoming_versions.sqlmesh_version}'.\n"
508-
"Please upgrade/downgrade your SQLMesh version to match the state file before performing the import."
509-
)
510-
511-
if clear:
512-
self.reset(default_catalog=None)
513-
514-
auto_restatements: t.Dict[SnapshotNameVersion, t.Optional[int]] = {}
515-
516-
for snapshot_chunk in chunk_iterable(stream.snapshots, SnapshotState.SNAPSHOT_BATCH_SIZE):
517-
snapshot_iterator, intervals_iterator, auto_restatments_iterator = itertools.tee(
518-
snapshot_chunk, 3
519-
)
520-
overwrite_existing_snapshots = (
521-
not clear
522-
) # if clear=True, all existing snapshots were dropped anyway
523-
self.snapshot_state.push_snapshots(
524-
snapshot_iterator, overwrite=overwrite_existing_snapshots
525-
)
526-
self.add_snapshots_intervals((s.snapshot_intervals for s in intervals_iterator))
491+
for state_chunk in stream:
492+
if isinstance(state_chunk, VersionsChunk):
493+
# SQLMesh major/minor version must match so that we can be sure the JSON contained in the state file
494+
# is compatible with our Pydantic model definitions. Patch versions dont need to match because the assumption
495+
# is that they dont contain any breaking changes
496+
incoming_versions = state_chunk.versions
497+
if (
498+
incoming_versions.minor_sqlmesh_version
499+
!= existing_versions.minor_sqlmesh_version
500+
):
501+
raise SQLMeshError(
502+
f"SQLMesh version mismatch. You are running '{existing_versions.sqlmesh_version}' but the state file was created with '{incoming_versions.sqlmesh_version}'.\n"
503+
"Please upgrade/downgrade your SQLMesh version to match the state file before performing the import."
504+
)
527505

528-
auto_restatements.update(
529-
{
530-
s.name_version: s.next_auto_restatement_ts
531-
for s in auto_restatments_iterator
532-
if s.next_auto_restatement_ts
533-
}
534-
)
506+
if clear:
507+
self.reset(default_catalog=None)
508+
509+
if isinstance(state_chunk, SnapshotsChunk):
510+
auto_restatements: t.Dict[SnapshotNameVersion, t.Optional[int]] = {}
511+
512+
for snapshot_chunk in chunk_iterable(
513+
state_chunk, SnapshotState.SNAPSHOT_BATCH_SIZE
514+
):
515+
snapshot_chunk = list(snapshot_chunk)
516+
overwrite_existing_snapshots = (
517+
not clear
518+
) # if clear=True, all existing snapshots were dropped anyway
519+
self.snapshot_state.push_snapshots(
520+
snapshot_chunk, overwrite=overwrite_existing_snapshots
521+
)
522+
self.add_snapshots_intervals((s.snapshot_intervals for s in snapshot_chunk))
523+
524+
auto_restatements.update(
525+
{
526+
s.name_version: s.next_auto_restatement_ts
527+
for s in snapshot_chunk
528+
if s.next_auto_restatement_ts
529+
}
530+
)
535531

536-
for environment_with_statements in stream.environments:
537-
environment = environment_with_statements.environment
538-
self.environment_state.update_environment(environment)
539-
self.environment_state.update_environment_statements(
540-
environment.name, environment.plan_id, environment_with_statements.statements
541-
)
532+
self.update_auto_restatements(auto_restatements)
542533

543-
self.update_auto_restatements(auto_restatements)
534+
if isinstance(state_chunk, EnvironmentsChunk):
535+
for environment_with_statements in state_chunk:
536+
environment = environment_with_statements.environment
537+
self.environment_state.update_environment(environment)
538+
self.environment_state.update_environment_statements(
539+
environment.name,
540+
environment.plan_id,
541+
environment_with_statements.statements,
542+
)
544543

545544
def state_type(self) -> str:
546545
return self.engine_adapter.dialect

0 commit comments

Comments
 (0)