Skip to content

Commit 9ea63b1

Browse files
committed
Ensure correct AST nodes are created when reading from state as well
1 parent 44496e0 commit 9ea63b1

4 files changed

Lines changed: 262 additions & 1 deletion

File tree

sqlmesh/core/model/meta.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from typing_extensions import Self
66

77
from pydantic import Field
8-
from sqlglot import Dialect, exp
8+
from sqlglot import Dialect, exp, parse_one
99
from sqlglot.helper import ensure_collection, ensure_list
1010
from sqlglot.optimizer.normalize_identifiers import normalize_identifiers
1111

@@ -182,6 +182,14 @@ def _gateway_validator(cls, v: t.Any) -> t.Optional[str]:
182182
def _partition_and_cluster_validator(
183183
cls, v: t.Any, info: ValidationInfo
184184
) -> t.List[exp.Expression]:
185+
if isinstance(v, list) and info.field_name == "partitioned_by_":
186+
# this branch gets hit when we are deserializing from json because `partitioned_by` is stored as a List[str]
187+
string_to_parse = (
188+
f"({','.join(v)})" # recreate the (a, b, c) part of "partitioned_by (a, b, c)"
189+
)
190+
parsed = parse_one(string_to_parse, into=exp.PartitionedByProperty)
191+
v = parsed.this.expressions if isinstance(parsed.this, exp.Schema) else v
192+
185193
expressions = list_of_fields_validator(v, info.data)
186194

187195
for expression in expressions:
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
"""Remove superfluous exp.Paren references from partitioned_by"""
2+
3+
import json
4+
5+
import pandas as pd
6+
from sqlglot import exp
7+
8+
from sqlmesh.utils.migration import index_text_type
9+
from sqlmesh.utils.migration import blob_text_type
10+
11+
12+
def migrate(state_sync, **kwargs):  # type: ignore
    """Strip a superfluous surrounding parenthesis pair from each serialized
    `partitioned_by` entry in stored snapshots, i.e. rewrite '(foo)' to 'foo',
    then rewrite the snapshots table only if anything actually changed."""
    engine_adapter = state_sync.engine_adapter
    schema = state_sync.schema
    snapshots_table = "_snapshots"
    index_type = index_text_type(engine_adapter.dialect)
    if schema:
        snapshots_table = f"{schema}.{snapshots_table}"

    # One canonical column order, shared by the SELECT and the rows we rebuild.
    columns = (
        "name",
        "identifier",
        "version",
        "snapshot",
        "kind_name",
        "updated_ts",
        "unpaused_ts",
        "ttl_ms",
        "unrestorable",
    )

    new_snapshots = []
    updated = False

    for row in engine_adapter.fetchall(
        exp.select(*columns).from_(snapshots_table),
        quote_identifiers=True,
    ):
        record = dict(zip(columns, row))
        parsed_snapshot = json.loads(record["snapshot"])

        if partitioned_by := parsed_snapshot["node"].get("partitioned_by"):
            # rewrite '(foo)' to 'foo'
            stripped = [
                item[1:-1] if item.startswith("(") and item.endswith(")") else item
                for item in partitioned_by
            ]
            if stripped != partitioned_by:
                updated = True
            parsed_snapshot["node"]["partitioned_by"] = stripped

        record["snapshot"] = json.dumps(parsed_snapshot)
        new_snapshots.append(record)

    # Only rewrite the table when at least one snapshot was actually modified.
    if new_snapshots and updated:
        engine_adapter.delete_from(snapshots_table, "TRUE")
        blob_type = blob_text_type(engine_adapter.dialect)

        engine_adapter.insert_append(
            snapshots_table,
            pd.DataFrame(new_snapshots),
            columns_to_types={
                "name": exp.DataType.build(index_type),
                "identifier": exp.DataType.build(index_type),
                "version": exp.DataType.build(index_type),
                "snapshot": exp.DataType.build(blob_type),
                "kind_name": exp.DataType.build(index_type),
                "updated_ts": exp.DataType.build("bigint"),
                "unpaused_ts": exp.DataType.build("bigint"),
                "ttl_ms": exp.DataType.build("bigint"),
                "unrestorable": exp.DataType.build("boolean"),
            },
        )

tests/core/test_model.py

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1514,6 +1514,134 @@ def test_render_definition_with_defaults():
15141514
) == d.format_model_expressions(expected_expressions)
15151515

15161516

1517+
def test_render_definition_partitioned_by():
    """`partitioned_by` renders bare for a single column and parenthesized for multiple."""

    def _load(sql, **kwargs):
        return load_sql_based_model(d.parse(sql), **kwargs)

    def _render(model):
        return model.render_definition()[0].sql(pretty=True)

    # both single-column spellings (bare and parenthesized) render without parenthesis
    single_column_rendered = (
        'MODEL (\n  name db.table,\n  kind FULL,\n  partitioned_by "a"\n)'
    )

    # no parenthesis in definition, no parenthesis when rendered
    model = _load(
        """
        MODEL (
            name db.table,
            kind FULL,
            partitioned_by a
        );

        select 1 as a;
        """
    )
    assert model.partitioned_by == [exp.column("a", quoted=True)]
    assert _render(model) == single_column_rendered

    # single column wrapped in parenthesis in definition, no parenthesis in rendered
    model = _load(
        """
        MODEL (
            name db.table,
            kind FULL,
            partitioned_by (a)
        );

        select 1 as a;
        """
    )
    assert model.partitioned_by == [exp.column("a", quoted=True)]
    assert _render(model) == single_column_rendered

    # multiple columns wrapped in parenthesis in definition, parenthesis in rendered
    model = _load(
        """
        MODEL (
            name db.table,
            kind FULL,
            partitioned_by (a, b)
        );

        select 1 as a, 2 as b;
        """
    )
    assert model.partitioned_by == [exp.column("a", quoted=True), exp.column("b", quoted=True)]
    assert _render(model) == (
        'MODEL (\n  name db.table,\n  kind FULL,\n  partitioned_by ("a", "b")\n)'
    )

    # multiple columns not wrapped in parenthesis in the definition is an error
    with pytest.raises(ParseError, match=r"keyword: 'value' missing"):
        _load(
            """
            MODEL (
                name db.table,
                kind FULL,
                partitioned_by a, b
            );

            select 1 as a, 2 as b;
            """
        )

    # Iceberg transforms / functions
    model = _load(
        """
        MODEL (
            name db.table,
            kind FULL,
            partitioned_by (day(a), truncate(b, 4), bucket(c, 3))
        );

        select 1 as a, 2 as b, 3 as c;
        """,
        dialect="trino",
    )
    assert model.partitioned_by == [
        exp.Day(this=exp.column("a", quoted=True)),
        exp.PartitionByTruncate(
            this=exp.column("b", quoted=True), expression=exp.Literal.number(4)
        ),
        exp.PartitionedByBucket(
            this=exp.column("c", quoted=True), expression=exp.Literal.number(3)
        ),
    ]
    assert _render(model) == (
        "MODEL (\n"
        "  name db.table,\n"
        "  dialect trino,\n"
        "  kind FULL,\n"
        '  partitioned_by (DAY("a"), TRUNCATE("b", 4), BUCKET("c", 3))\n'
        ")"
    )
15171645
def test_cron():
15181646
daily = _Node(name="x", cron="@daily")
15191647
assert to_datetime(daily.cron_prev("2020-01-01")) == to_datetime("2019-12-31")

tests/core/test_snapshot.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2938,3 +2938,37 @@ def check_types(batch, env: str, sql: list[SQL], table: exp.Table, default: int
29382938
)
29392939
snapshot_a = make_snapshot(sql_model)
29402940
assert snapshot_a.check_ready_intervals([(0, 1)], mocker.Mock()) == [(0, 1)]
2941+
2942+
2943+
def test_partitioned_by_roundtrip(make_snapshot: t.Callable):
    """Serializing a snapshot to JSON and back must preserve `partitioned_by` AST nodes."""
    sql_model = load_sql_based_model(
        parse("""
        MODEL (
            name test_schema.test_model,
            kind full,
            partitioned_by (a, bucket(4, b), truncate(3, c), month(d))
        );
        SELECT a, b, c, d FROM tbl;
        """)
    )
    snapshot = make_snapshot(sql_model)
    assert isinstance(snapshot, Snapshot)
    assert isinstance(snapshot.node, SqlModel)

    expected = [
        exp.column("a", quoted=True),
        exp.PartitionedByBucket(
            this=exp.column("b", quoted=True), expression=exp.Literal.number(4)
        ),
        exp.PartitionByTruncate(
            this=exp.column("c", quoted=True), expression=exp.Literal.number(3)
        ),
        exp.Month(this=exp.column("d", quoted=True)),
    ]
    assert snapshot.node.partitioned_by == expected

    # roundtrip through json and ensure we get correct AST nodes on the other end
    deserialized = snapshot.parse_raw(snapshot.json())

    assert isinstance(deserialized.node, SqlModel)
    assert deserialized.node.partitioned_by == expected

0 commit comments

Comments
 (0)