Fix: Allow Python models to emit DataFrames with a different column order than what is declared in @model

erindru · erindru · commit 33b034516145 · 2025-05-13T00:38:19.000Z
diff --git a/sqlmesh/core/engine_adapter/base.py b/sqlmesh/core/engine_adapter/base.py
@@ -246,7 +246,12 @@ def _df_to_source_queries(
         assert isinstance(df, pd.DataFrame)
         num_rows = len(df.index)
         batch_size = sys.maxsize if batch_size == 0 else batch_size
+
+        # reorder the DataFrame so that the order of the emitted values matches the order of the columns in columns_to_types;
+        # they can differ if a user specifies columns() on a Python model in a different order than the columns of the DataFrames emitted by that model
+        df = df[list(columns_to_types.keys())]
         values = list(df.itertuples(index=False, name=None))
+
         return [
             SourceQuery(
                 query_factory=partial(
diff --git a/sqlmesh/core/engine_adapter/mssql.py b/sqlmesh/core/engine_adapter/mssql.py
@@ -218,10 +218,13 @@ def query_factory() -> Query:
             # as later calls.
             if not self.table_exists(temp_table):
                 columns_to_types_create = columns_to_types.copy()
-                self._convert_df_datetime(df, columns_to_types_create)
+                ordered_df = df[
+                    list(columns_to_types_create.keys())
+                ]  # reorder DataFrame so it matches columns_to_types
+                self._convert_df_datetime(ordered_df, columns_to_types_create)
                 self.create_table(temp_table, columns_to_types_create)
                 rows: t.List[t.Tuple[t.Any, ...]] = list(
-                    df.replace({np.nan: None}).itertuples(index=False, name=None)  # type: ignore
+                    ordered_df.replace({np.nan: None}).itertuples(index=False, name=None)  # type: ignore
                 )
                 conn = self._connection_pool.get()
                 conn.bulk_copy(temp_table.sql(dialect=self.dialect), rows)
diff --git a/tests/core/engine_adapter/integration/test_integration.py b/tests/core/engine_adapter/integration/test_integration.py
@@ -2732,3 +2732,50 @@ def _use_warehouse_as_state_connection(gateway_name: str, config: Config):
 
     # will throw if one of the migrations produces an error, which can happen if we forget to take quoting or normalization into account
     sqlmesh_context.migrate()
+
+
+def test_python_model_column_order(ctx: TestContext, tmp_path: pathlib.Path):
+    if ctx.test_type != "df":
+        pytest.skip("python model column order test only needs to be run once per db")
+
+    test_schema = ctx.add_test_suffix("column_order")
+
+    (tmp_path / "models").mkdir()
+
+    # note: this model deliberately declares the columns in the @model definition in a different order than
+    # the columns of the DataFrame returned by the model's execute() function
+    (tmp_path / "models" / "python_model.py").write_text(
+        """
+import pandas as pd
+import typing as t
+from sqlmesh import ExecutionContext, model
+
+@model(
+    "TEST_SCHEMA.model",
+    columns={
+        "id": "int",
+        "name": "varchar"
+    }
+)
+def execute(
+    context: ExecutionContext,
+    **kwargs: t.Any,
+) -> pd.DataFrame:
+    return pd.DataFrame([
+        {"name": "foo", "id": 1}
+    ])
+""".replace("TEST_SCHEMA", test_schema)
+    )
+
+    sqlmesh_ctx = ctx.create_context(path=tmp_path)
+
+    assert len(sqlmesh_ctx.models) == 1
+
+    plan = sqlmesh_ctx.plan(auto_apply=True)
+    assert len(plan.new_snapshots) == 1
+
+    engine_adapter = sqlmesh_ctx.engine_adapter
+
+    df = engine_adapter.fetchdf(f"select * from {test_schema}.model")
+    assert len(df) == 1
+    assert df.iloc[0].to_dict() == {"id": 1, "name": "foo"}