Skip to content

Commit 3e93705

Browse files
authored
Fix: handle duplicate model names across sql, python and external models (#3945)
1 parent 6056dbf commit 3e93705

2 files changed

Lines changed: 218 additions & 5 deletions

File tree

sqlmesh/core/loader.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import logging
77
import os
88
import typing as t
9-
from collections import defaultdict
9+
from collections import Counter, defaultdict
1010
from dataclasses import dataclass
1111
from pathlib import Path
1212

@@ -396,11 +396,16 @@ def _load_models(
396396
Loads all of the models within the model directory with their associated
397397
audits into a Dict and creates the dag
398398
"""
399-
models = self._load_sql_models(macros, jinja_macros, audits, signals)
400-
models.update(self._load_external_models(audits, gateway))
401-
models.update(self._load_python_models(macros, jinja_macros, audits, signals))
399+
sql_models = self._load_sql_models(macros, jinja_macros, audits, signals)
400+
external_models = self._load_external_models(audits, gateway)
401+
python_models = self._load_python_models(macros, jinja_macros, audits, signals)
402402

403-
return models
403+
all_model_names = list(sql_models) + list(external_models) + list(python_models)
404+
duplicates = [name for name, count in Counter(all_model_names).items() if count > 1]
405+
if duplicates:
406+
raise ValueError(f"Duplicate model name(s) found: {', '.join(duplicates)}.")
407+
408+
return UniqueKeyDict("models", **sql_models, **external_models, **python_models)
404409

405410
def _load_sql_models(
406411
self,

tests/core/test_loader.py

Lines changed: 208 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,208 @@
1+
import pytest
2+
from pathlib import Path
3+
from sqlmesh.cli.example_project import init_example_project
4+
from sqlmesh.core.config import Config, ModelDefaultsConfig
5+
from sqlmesh.core.context import Context
6+
from sqlmesh.utils.errors import ConfigError
7+
8+
9+
@pytest.fixture
10+
def sample_models(request):
11+
models = {
12+
"sql": {
13+
"contents": """
14+
MODEL (
15+
name test_schema.test_model,
16+
kind FULL,
17+
);
18+
19+
SELECT 1;
20+
""",
21+
"path": "models/sql_model.sql",
22+
},
23+
"python": {
24+
"contents": """import typing as t
25+
import pandas as pd
26+
from sqlmesh import ExecutionContext, model
27+
28+
@model(
29+
"test_schema.test_model",
30+
kind="FULL",
31+
columns={
32+
"id": "int",
33+
}
34+
)
35+
def execute(
36+
context: ExecutionContext,
37+
**kwargs: t.Any,
38+
) -> pd.DataFrame:
39+
return pd.DataFrame([
40+
{"id": 1}
41+
])
42+
""",
43+
"path": "models/python_model.py",
44+
},
45+
"external": {
46+
"contents": """
47+
- name: test_schema.test_model
48+
columns:
49+
id: INT
50+
""",
51+
"path": "external_models/external_model.yaml",
52+
},
53+
}
54+
requested_models = request.param.split("_")
55+
return [v for k, v in models.items() if k in requested_models]
56+
57+
58+
@pytest.mark.parametrize(
59+
"sample_models",
60+
["sql_python", "python_external", "sql_external", "sql_python_external"],
61+
indirect=True,
62+
)
63+
def test_duplicate_model_names_different_kind(tmp_path: Path, sample_models):
64+
"""Test different (SQL, Python and external) models with duplicate model names raises ValueError."""
65+
model_1, *models = sample_models
66+
if len(models) == 2:
67+
model_2, model_3 = models
68+
else:
69+
model_2, model_3 = models[0], None
70+
71+
init_example_project(tmp_path, dialect="duckdb")
72+
config = Config(model_defaults=ModelDefaultsConfig(dialect="duckdb"))
73+
74+
path_1: Path = tmp_path / model_1["path"]
75+
path_2: Path = tmp_path / model_2["path"]
76+
77+
path_1.parent.mkdir(parents=True, exist_ok=True)
78+
path_1.write_text(model_1["contents"])
79+
path_2.parent.mkdir(parents=True, exist_ok=True)
80+
path_2.write_text(model_2["contents"])
81+
82+
if model_3:
83+
path_3: Path = tmp_path / model_3["path"]
84+
path_3.parent.mkdir(parents=True, exist_ok=True)
85+
path_3.write_text(model_3["contents"])
86+
87+
with pytest.raises(
88+
ValueError, match=r'Duplicate model name\(s\) found: "memory"."test_schema"."test_model".'
89+
):
90+
Context(paths=tmp_path, config=config)
91+
92+
93+
@pytest.mark.parametrize("sample_models", ["sql", "external"], indirect=True)
94+
def test_duplicate_model_names_same_kind(tmp_path: Path, sample_models):
95+
"""Test same (SQL and external) models with duplicate model names raises ValueError."""
96+
97+
def duplicate_model_path(fpath):
98+
return Path(fpath).parent / ("duplicate" + Path(fpath).suffix)
99+
100+
model = sample_models[0]
101+
init_example_project(tmp_path, dialect="duckdb")
102+
config = Config(model_defaults=ModelDefaultsConfig(dialect="duckdb"))
103+
104+
path_1: Path = tmp_path / model["path"]
105+
path_1.parent.mkdir(parents=True, exist_ok=True)
106+
path_1.write_text(model["contents"])
107+
108+
duplicate_fpath = tmp_path / duplicate_model_path(model["path"])
109+
duplicate_fpath.write_text(model["contents"])
110+
111+
with pytest.raises(
112+
ValueError,
113+
match=r'Duplicate key \'"memory"."test_schema"."test_model"\' found in UniqueKeyDict<models>. Call dict.update\(\.\.\.\) if this is intentional.',
114+
):
115+
Context(paths=tmp_path, config=config)
116+
117+
118+
@pytest.mark.isolated
119+
def test_duplicate_python_model_names_raise_error(tmp_path: Path) -> None:
120+
"""Test python models with duplicate model names raises ConfigError if the functions are not identical."""
121+
init_example_project(tmp_path, dialect="duckdb")
122+
config = Config(model_defaults=ModelDefaultsConfig(dialect="duckdb"))
123+
model_name = "test_schema.test_model"
124+
125+
path_a = tmp_path / "models/test_schema/test_model_a.py"
126+
path_b = tmp_path / "models/test_schema/test_model_b.py"
127+
128+
model_payload_a = f"""from sqlmesh import model
129+
@model(
130+
name="{model_name}",
131+
columns={{'"COL"': "int"}},
132+
)
133+
def my_model(context, **kwargs):
134+
pass"""
135+
136+
model_payload_b = f"""import typing as t
137+
import pandas as pd
138+
from sqlmesh import ExecutionContext, model
139+
140+
@model(
141+
name="{model_name}",
142+
kind="FULL",
143+
columns={{
144+
"id": "int",
145+
}}
146+
)
147+
def execute(
148+
context: ExecutionContext,
149+
**kwargs: t.Any,
150+
) -> pd.DataFrame:
151+
return pd.DataFrame([
152+
{{"id": 1}}
153+
])
154+
"""
155+
156+
path_a.parent.mkdir(parents=True, exist_ok=True)
157+
path_a.write_text(model_payload_a)
158+
path_b.write_text(model_payload_b)
159+
160+
with pytest.raises(
161+
ConfigError,
162+
match=r"Failed to load model definition at '.*'.\nDuplicate key 'test_schema.test_model' found in UniqueKeyDict<python_models>.",
163+
):
164+
Context(paths=tmp_path, config=config)
165+
166+
167+
@pytest.mark.slow
168+
def test_duplicate_python_model_names_no_error(tmp_path: Path) -> None:
169+
"""Test python models with duplicate model names raises no error if the functions are identical."""
170+
init_example_project(tmp_path, dialect="duckdb")
171+
config = Config(model_defaults=ModelDefaultsConfig(dialect="duckdb"))
172+
model_name = "test_schema.test_model"
173+
174+
path_a = tmp_path / "models/test_schema1/test_model_a.py"
175+
path_b = tmp_path / "models/test_schema2/test_model_b.py"
176+
177+
model_payload_a = f"""from sqlmesh import model
178+
@model(
179+
name="{model_name}",
180+
columns={{'"COL"': "int"}},
181+
description="model_payload_a",
182+
)
183+
def my_model(context, **kwargs):
184+
pass"""
185+
186+
model_payload_b = f"""from sqlmesh import model
187+
@model(
188+
name="{model_name}",
189+
columns={{'"COL"': "int"}},
190+
description="model_payload_b",
191+
)
192+
def my_model(context, **kwargs):
193+
pass"""
194+
195+
path_a.parent.mkdir(parents=True, exist_ok=True)
196+
path_b.parent.mkdir(parents=True, exist_ok=True)
197+
path_a.write_text(model_payload_a)
198+
context = Context(paths=tmp_path, config=config)
199+
context.load()
200+
model = context.get_model(f"{model_name}")
201+
assert model.description == "model_payload_a"
202+
path_b.write_text(model_payload_b)
203+
context.load() # raise no error to duplicate key if the functions are identical (by registry class_method)
204+
model = context.get_model(f"{model_name}")
205+
assert (
206+
model.description != "model_payload_b"
207+
) # model will not be overwritten by model_payload_b
208+
assert model.description == "model_payload_a"

0 commit comments

Comments
 (0)