Skip to content

Commit 2cf32bc

Browse files
committed
Add tests for BigFrame and Snowpark dataframes
1 parent 6c283e8 commit 2cf32bc

10 files changed

Lines changed: 144 additions & 36 deletions

File tree

pyproject.toml

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,9 @@ athena = ["PyAthena[Pandas]"]
4040
azuresql = ["pymssql"]
4141
bigquery = [
4242
"google-cloud-bigquery[pandas]",
43-
"google-cloud-bigquery-storage"
43+
"google-cloud-bigquery-storage",
44+
"bigframes>=1.32.0"
4445
]
45-
bigframes = ["bigframes>=1.32.0"]
4646
clickhouse = ["clickhouse-connect"]
4747
databricks = ["databricks-sql-connector[pyarrow]"]
4848
dev = [
@@ -107,8 +107,7 @@ slack = ["slack_sdk"]
107107
snowflake = [
108108
"cryptography",
109109
"snowflake-connector-python[pandas,secure-local-storage]",
110-
# as at 2024-08-05, snowflake-snowpark-python is only available up to Python 3.11
111-
"snowflake-snowpark-python; python_version<'3.12'",
110+
"snowflake-snowpark-python",
112111
]
113112
trino = ["trino"]
114113
web = [

sqlmesh/core/engine_adapter/base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -249,7 +249,7 @@ def _df_to_source_queries(
249249

250250
# we need to ensure that the order of the columns in columns_to_types columns matches the order of the values
251251
# they can differ if a user specifies columns() on a python model in a different order than what's in the DataFrames emitted by that model
252-
df = df[list(columns_to_types.keys())]
252+
df = df[list(columns_to_types)]
253253
values = list(df.itertuples(index=False, name=None))
254254

255255
return [

sqlmesh/core/engine_adapter/mssql.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -219,7 +219,7 @@ def query_factory() -> Query:
219219
if not self.table_exists(temp_table):
220220
columns_to_types_create = columns_to_types.copy()
221221
ordered_df = df[
222-
list(columns_to_types_create.keys())
222+
list(columns_to_types_create)
223223
] # reorder DataFrame so it matches columns_to_types
224224
self._convert_df_datetime(ordered_df, columns_to_types_create)
225225
self.create_table(temp_table, columns_to_types_create)

sqlmesh/core/engine_adapter/snowflake.py

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -288,8 +288,25 @@ def _df_to_source_queries(
288288
is_snowpark_dataframe = snowpark and isinstance(df, snowpark.dataframe.DataFrame)
289289

290290
def query_factory() -> Query:
291+
# The catalog needs to be normalized before being passed to Snowflake's library functions because they
292+
# just wrap whatever they are given in quotes without checking if it's already quoted
293+
database = (
294+
normalize_identifiers(temp_table.catalog, dialect=self.dialect)
295+
if temp_table.catalog
296+
else None
297+
)
298+
291299
if is_snowpark_dataframe:
292-
df.createOrReplaceTempView(temp_table.sql(dialect=self.dialect, identify=True)) # type: ignore
300+
temp_table.set("catalog", database)
301+
df_renamed = df.rename(
302+
{
303+
col: exp.to_identifier(col, quoted=True).sql(dialect=self.dialect)
304+
for col in columns_to_types
305+
}
306+
) # type: ignore
307+
df_renamed.createOrReplaceTempView(
308+
temp_table.sql(dialect=self.dialect, identify=True)
309+
) # type: ignore
293310
elif isinstance(df, pd.DataFrame):
294311
from snowflake.connector.pandas_tools import write_pandas
295312

@@ -325,11 +342,7 @@ def query_factory() -> Query:
325342
df,
326343
temp_table.name,
327344
schema=temp_table.db or None,
328-
database=normalize_identifiers(temp_table.catalog, dialect=self.dialect).sql(
329-
dialect=self.dialect
330-
)
331-
if temp_table.catalog
332-
else None,
345+
database=database.sql(dialect=self.dialect) if database else None,
333346
chunk_size=self.DEFAULT_BATCH_SIZE,
334347
overwrite=True,
335348
table_type="temp",

sqlmesh/core/engine_adapter/spark.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -281,14 +281,14 @@ def _ensure_pyspark_df(
281281
if pyspark_df:
282282
if columns_to_types:
283283
# ensure Spark dataframe column order matches columns_to_types
284-
pyspark_df = pyspark_df.select(*list(columns_to_types.keys()))
284+
pyspark_df = pyspark_df.select(*list(columns_to_types))
285285
return pyspark_df
286286
df = self.try_get_pandas_df(generic_df)
287287
if df is None:
288288
raise SQLMeshError("Ensure PySpark DF can only be run on a PySpark or Pandas DataFrame")
289289
if columns_to_types:
290290
# ensure Pandas dataframe column order matches columns_to_types
291-
df = df[list(columns_to_types.keys())]
291+
df = df[list(columns_to_types)]
292292
kwargs = (
293293
dict(schema=self.sqlglot_to_spark_types(columns_to_types)) if columns_to_types else {}
294294
)

tests/core/engine_adapter/integration/__init__.py

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -359,7 +359,7 @@ def get_table_comment(
359359
FROM pg_class c
360360
INNER JOIN pg_description d ON c.oid = d.objoid AND d.objsubid = 0
361361
INNER JOIN pg_namespace n ON c.relnamespace = n.oid
362-
WHERE
362+
WHERE
363363
c.relname = '{table_name}'
364364
AND n.nspname= '{schema_name}'
365365
AND c.relkind = '{"v" if table_kind == "VIEW" else "r"}'
@@ -465,12 +465,12 @@ def get_column_comments(
465465
INNER JOIN pg_namespace n ON c.relnamespace = n.oid
466466
INNER JOIN pg_attribute a ON c.oid = a.attrelid
467467
INNER JOIN pg_description d
468-
ON
468+
ON
469469
a.attnum = d.objsubid
470470
AND d.objoid = c.oid
471471
WHERE
472472
n.nspname = '{schema_name}'
473-
AND c.relname = '{table_name}'
473+
AND c.relname = '{table_name}'
474474
AND c.relkind = '{"v" if table_kind == "VIEW" else "r"}'
475475
;
476476
"""
@@ -494,6 +494,7 @@ def create_context(
494494
self,
495495
config_mutator: t.Optional[t.Callable[[str, Config], None]] = None,
496496
path: t.Optional[pathlib.Path] = None,
497+
ephemeral_state_connection: bool = True,
497498
) -> Context:
498499
private_sqlmesh_dir = pathlib.Path(pathlib.Path().home(), ".sqlmesh")
499500
config = load_config_from_paths(
@@ -509,14 +510,12 @@ def create_context(
509510
config.gateways = {self.gateway: config.gateways[self.gateway]}
510511

511512
gateway_config = config.gateways[self.gateway]
512-
if (
513-
(sc := gateway_config.state_connection)
514-
and (conn := gateway_config.connection)
515-
and sc.type_ == "duckdb"
516-
):
517-
# if duckdb is being used as the state connection, set concurrent_tasks=1 on the main connection
518-
# to prevent duckdb from being accessed from multiple threads and getting deadlocked
519-
conn.concurrent_tasks = 1
513+
if ephemeral_state_connection:
514+
# Override whatever state connection has been configured on the integration test config to use in-memory DuckDB instead
515+
# This is so tests that initialize a SQLMesh context can run concurrently without clobbering each other's state
516+
from sqlmesh.core.config.connection import DuckDBConnectionConfig
517+
518+
gateway_config.state_connection = DuckDBConnectionConfig()
520519

521520
if "athena" in self.gateway:
522521
conn = gateway_config.connection

tests/core/engine_adapter/integration/test_integration.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2721,7 +2721,9 @@ def _use_warehouse_as_state_connection(gateway_name: str, config: Config):
27212721

27222722
config.gateways[gateway_name].state_schema = test_schema
27232723

2724-
sqlmesh_context = ctx.create_context(config_mutator=_use_warehouse_as_state_connection)
2724+
sqlmesh_context = ctx.create_context(
2725+
config_mutator=_use_warehouse_as_state_connection, ephemeral_state_connection=False
2726+
)
27252727
assert sqlmesh_context.config.get_state_schema(ctx.gateway) == test_schema
27262728

27272729
state_sync = (
@@ -2742,8 +2744,7 @@ def test_python_model_column_order(ctx: TestContext, tmp_path_factory: pytest.Te
27422744
pytest.skip("python model column order test only needs to be run once per db")
27432745

27442746
tmp_path = tmp_path_factory.mktemp(f"column_order_{ctx.test_id}")
2745-
2746-
test_schema = ctx.add_test_suffix("column_order")
2747+
schema = ctx.add_test_suffix(TEST_SCHEMA)
27472748

27482749
(tmp_path / "models").mkdir()
27492750

@@ -2772,7 +2773,7 @@ def execute(
27722773
return context.spark.createDataFrame([
27732774
Row(name="foo", id=1)
27742775
])
2775-
""".replace("TEST_SCHEMA", test_schema)
2776+
""".replace("TEST_SCHEMA", schema)
27762777
)
27772778
else:
27782779
# python model that emits a Pandas DataFrame
@@ -2796,7 +2797,7 @@ def execute(
27962797
return pd.DataFrame([
27972798
{"name": "foo", "id": 1}
27982799
])
2799-
""".replace("TEST_SCHEMA", test_schema)
2800+
""".replace("TEST_SCHEMA", schema)
28002801
)
28012802

28022803
sqlmesh_ctx = ctx.create_context(path=tmp_path)
@@ -2808,6 +2809,9 @@ def execute(
28082809

28092810
engine_adapter = sqlmesh_ctx.engine_adapter
28102811

2811-
df = engine_adapter.fetchdf(f"select * from {test_schema}.model")
2812+
query = exp.select("*").from_(
2813+
exp.to_table(f"{schema}.model", dialect=ctx.dialect), dialect=ctx.dialect
2814+
)
2815+
df = engine_adapter.fetchdf(query, quote_identifiers=True)
28122816
assert len(df) == 1
28132817
assert df.iloc[0].to_dict() == {"id": 1, "name": "foo"}

tests/core/engine_adapter/integration/test_integration_bigquery.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -433,3 +433,50 @@ def test_table_diff_table_name_matches_column_name(ctx: TestContext):
433433

434434
assert row_diff.stats["join_count"] == 1
435435
assert row_diff.full_match_count == 1
436+
437+
438+
def test_bigframe_python_model_column_order(ctx: TestContext, tmp_path: Path):
439+
model_name = ctx.table("TEST")
440+
441+
(tmp_path / "models").mkdir()
442+
443+
# note: this model deliberately defines the columns in the @model definition to be in a different order than what
444+
# is returned by the DataFrame within the model
445+
model_path = tmp_path / "models" / "python_model.py"
446+
447+
# python model that emits a BigFrame dataframe
448+
model_path.write_text(
449+
"""
450+
from bigframes.pandas import DataFrame
451+
import typing as t
452+
from sqlmesh import ExecutionContext, model
453+
454+
@model(
455+
'MODEL_NAME',
456+
columns={
457+
"id": "int",
458+
"name": "varchar"
459+
},
460+
dialect="bigquery"
461+
)
462+
def execute(
463+
context: ExecutionContext,
464+
**kwargs: t.Any,
465+
) -> DataFrame:
466+
return DataFrame({'name': ['foo'], 'id': [1]}, session=context.bigframe)
467+
""".replace("MODEL_NAME", model_name.sql(dialect="bigquery"))
468+
)
469+
470+
sqlmesh_ctx = ctx.create_context(path=tmp_path)
471+
472+
assert len(sqlmesh_ctx.models) == 1
473+
474+
plan = sqlmesh_ctx.plan(auto_apply=True)
475+
assert len(plan.new_snapshots) == 1
476+
477+
engine_adapter = sqlmesh_ctx.engine_adapter
478+
479+
query = exp.select("*").from_(model_name)
480+
df = engine_adapter.fetchdf(query, quote_identifiers=True)
481+
assert len(df) == 1
482+
assert df.iloc[0].to_dict() == {"id": 1, "name": "foo"}

tests/core/engine_adapter/integration/test_integration_snowflake.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import typing as t
22
import pytest
33
from sqlglot import exp
4+
from pathlib import Path
45
from sqlglot.optimizer.qualify_columns import quote_identifiers
56
from sqlglot.helper import seq_get
67
from sqlmesh.core.engine_adapter import SnowflakeEngineAdapter
@@ -210,3 +211,49 @@ def test_create_iceberg_table(ctx: TestContext, engine_adapter: SnowflakeEngineA
210211
result = sqlmesh.plan(auto_apply=True)
211212

212213
assert len(result.new_snapshots) == 2
214+
215+
216+
def test_snowpark_python_model_column_order(ctx: TestContext, tmp_path: Path):
217+
model_name = ctx.table("TEST")
218+
219+
(tmp_path / "models").mkdir()
220+
221+
# note: this model deliberately defines the columns in the @model definition to be in a different order than what
222+
# is returned by the DataFrame within the model
223+
model_path = tmp_path / "models" / "python_model.py"
224+
225+
# python model that emits a Snowpark DataFrame
226+
model_path.write_text(
227+
"""
228+
from snowflake.snowpark.dataframe import DataFrame
229+
import typing as t
230+
from sqlmesh import ExecutionContext, model
231+
232+
@model(
233+
'MODEL_NAME',
234+
columns={
235+
"id": "int",
236+
"name": "varchar"
237+
}
238+
)
239+
def execute(
240+
context: ExecutionContext,
241+
**kwargs: t.Any,
242+
) -> DataFrame:
243+
return context.snowpark.create_dataframe([["foo", 1]], schema=["name", "id"])
244+
""".replace("MODEL_NAME", model_name.sql(dialect="snowflake"))
245+
)
246+
247+
sqlmesh_ctx = ctx.create_context(path=tmp_path)
248+
249+
assert len(sqlmesh_ctx.models) == 1
250+
251+
plan = sqlmesh_ctx.plan(auto_apply=True)
252+
assert len(plan.new_snapshots) == 1
253+
254+
engine_adapter = sqlmesh_ctx.engine_adapter
255+
256+
query = exp.select("*").from_(plan.environment.snapshots[0].fully_qualified_table)
257+
df = engine_adapter.fetchdf(query, quote_identifiers=True)
258+
assert len(df) == 1
259+
assert df.iloc[0].to_dict() == {"id": 1, "name": "foo"}

tests/core/engine_adapter/test_snowflake.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -424,6 +424,10 @@ def test_replace_query_snowpark_dataframe(
424424
from snowflake.snowpark.dataframe import DataFrame as SnowparkDataFrame
425425

426426
session = Session.builder.config("local_testing", True).create()
427+
# df.createOrReplaceTempView() throws "[Local Testing] Mocking SnowflakePlan Rename is not supported" when used against the Snowflake local_testing session
428+
# since we can't trace any queries from the Snowpark library anyway, we just suppress this and verify the cleanup queries issued by our EngineAdapter
429+
session._conn._suppress_not_implemented_error = True
430+
427431
df: SnowparkDataFrame = session.create_dataframe([(1, "name")], schema=["ID", "NAME"])
428432
assert isinstance(df, SnowparkDataFrame)
429433

@@ -439,11 +443,6 @@ def test_replace_query_snowpark_dataframe(
439443
columns_to_types={"ID": exp.DataType.build("INT"), "NAME": exp.DataType.build("VARCHAR")},
440444
)
441445

442-
# the Snowflake library generates "CREATE TEMPORARY VIEW" from a direct DataFrame call
443-
# which doesnt pass through our EngineAdapter so we cant capture it
444-
spy.assert_called()
445-
assert "__temp_foo_e6wjkjj6" in spy.call_args[0][0]
446-
447446
# verify that DROP VIEW is called instead of DROP TABLE
448447
assert to_sql_calls(adapter) == [
449448
'CREATE OR REPLACE TABLE "foo" AS SELECT CAST("ID" AS INT) AS "ID", CAST("NAME" AS VARCHAR) AS "NAME" FROM (SELECT CAST("ID" AS INT) AS "ID", CAST("NAME" AS VARCHAR) AS "NAME" FROM "__temp_foo_e6wjkjj6") AS "_subquery"',

0 commit comments

Comments
 (0)