Skip to content

Commit 65ed579

Browse files
authored
Fix: 'duplicate definitions' python env bug due to module import order (#4249)
1 parent c1ae64f commit 65ed579

2 files changed

Lines changed: 146 additions & 2 deletions

File tree

sqlmesh/utils/metaprogramming.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,16 @@ def _code_globals(code: types.CodeType) -> t.Dict[str, None]:
6161
return variables
6262

6363

64+
def _globals_match(obj1: t.Any, obj2: t.Any) -> bool:
65+
return type(obj1) == type(obj2) and (
66+
obj1 == obj2
67+
or (
68+
getattr(obj1, "__module__", None) == getattr(obj2, "__module__", None)
69+
and getattr(obj1, "__name__", None) == getattr(obj2, "__name__", None)
70+
)
71+
)
72+
73+
6474
def func_globals(func: t.Callable) -> t.Dict[str, t.Any]:
6575
"""Finds all global references and closures in a function and nested functions.
6676
@@ -287,9 +297,17 @@ def build_env(
287297

288298
def walk(obj: t.Any, name: str, is_metadata: t.Optional[bool] = None) -> None:
289299
obj_module = inspect.getmodule(obj)
290-
if name in visited or (obj_module and obj_module.__name__ == "builtins"):
300+
if obj_module and obj_module.__name__ == "builtins":
291301
return
292302

303+
if name in visited:
304+
if name not in env or _globals_match(env[name][0], obj):
305+
return
306+
307+
raise SQLMeshError(
308+
f"Cannot store {obj} in environment, duplicate definitions found for '{name}'"
309+
)
310+
293311
visited.add(name)
294312
name_missing_from_env = name not in env
295313

@@ -320,7 +338,7 @@ def walk(obj: t.Any, name: str, is_metadata: t.Optional[bool] = None) -> None:
320338
):
321339
env[name] = (obj, is_metadata)
322340
return
323-
elif env[name][0] != obj:
341+
elif not _globals_match(env[name][0], obj):
324342
raise SQLMeshError(
325343
f"Cannot store {obj} in environment, duplicate definitions found for '{name}'"
326344
)

tests/core/test_model.py

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9235,3 +9235,129 @@ def test_runtime_stage(evaluator):
92359235
model = load_sql_based_model(expressions, jinja_macros=jinja_macros)
92369236
assert model.render_query().sql() == "SELECT 'loading' AS a, 'loading_bla' AS b"
92379237
assert set(model.python_env) == {"noop", "test_runtime_stage"}
9238+
9239+
9240+
def test_python_env_references_are_unequal_but_point_to_same_definition(tmp_path: Path) -> None:
9241+
# This tests for regressions against an edge case bug which was due to reloading modules
9242+
# in sqlmesh.utils.metaprogramming.import_python_file. Depending on the module loading
9243+
# order, we could get a "duplicate symbol in python env" error, even though the references
9244+
# essentially pointed to the same definition (e.g. function or class).
9245+
init_example_project(tmp_path, dialect="duckdb", template=ProjectTemplate.EMPTY)
9246+
9247+
db_path = str(tmp_path / "db.db")
9248+
db_connection = DuckDBConnectionConfig(database=db_path)
9249+
config = Config(
9250+
gateways={"duckdb": GatewayConfig(connection=db_connection)},
9251+
model_defaults=ModelDefaultsConfig(dialect="duckdb"),
9252+
)
9253+
9254+
file_a = tmp_path / "macros" / "a.py"
9255+
file_b = tmp_path / "macros" / "b.py"
9256+
file_c = tmp_path / "macros" / "c.py"
9257+
9258+
file_a.write_text(
9259+
"""from macros.c import target
9260+
9261+
def f1():
9262+
target()
9263+
"""
9264+
)
9265+
file_b.write_text(
9266+
"""from sqlmesh import macro
9267+
9268+
from macros.a import f1
9269+
from macros.c import target
9270+
9271+
@macro()
9272+
def first_macro(evaluator):
9273+
f1()
9274+
9275+
@macro()
9276+
def second_macro(evaluator):
9277+
target()
9278+
"""
9279+
)
9280+
file_c.write_text(
9281+
"""def target():
9282+
pass
9283+
"""
9284+
)
9285+
9286+
model_file = tmp_path / "models" / "model.sql"
9287+
model_file.write_text("MODEL (name a); @first_macro(); @second_macro(); SELECT 1 AS c")
9288+
9289+
ctx = Context(paths=tmp_path, config=config, load=False)
9290+
loader = ctx._loaders[0]
9291+
9292+
original_glob_paths = loader._glob_paths
9293+
9294+
def _patched_glob_paths(path, *args, **kwargs):
9295+
if path == tmp_path / "macros":
9296+
yield from [file_a, file_c, file_b]
9297+
else:
9298+
yield from original_glob_paths(path, *args, **kwargs)
9299+
9300+
# We force the import order to be a.py -> c.py -> b.py:
9301+
#
9302+
# 1. a.py is loaded, so "macros", "macros.a" and "macros.c" are loaded in sys.modules
9303+
# 2. c.py is loaded, so "macros" and "macros.c" are reloaded in sys.modules
9304+
# 3. b.py is loaded, so "macros" is reloaded and "macros.b" is loaded in sys.modules
9305+
#
9306+
# (1) => id(sys.modules["macros.a"].target) == id(sys.modules["macros.c"].target) == X
9307+
# (2) => id(sys.modules["macros.c"].target) == Y != X == id(sys.modules["macros.a"].target)
9308+
# (3) => affects neither sys.modules["macros.a"] nor sys.modules["macros.c"], just loads the macros
9309+
#
9310+
# At this point we have two different function instances, one in sys.modules["macros.a"] and one
9311+
# in sys.modules["macros.c"], which encapsulate the same definition (source code). This used to
9312+
# lead to a crash, because we prohibit unequal objects with the same name in the python env.
9313+
with patch.object(loader, "_glob_paths", side_effect=_patched_glob_paths):
9314+
ctx.load()
9315+
9316+
model = ctx.models['"a"']
9317+
python_env = model.python_env
9318+
9319+
assert len(python_env) == 4
9320+
assert python_env.get("target") == Executable(
9321+
payload="def target():\n pass", name="target", path="macros/c.py"
9322+
)
9323+
9324+
9325+
def test_unequal_duplicate_python_env_references_are_prohibited(tmp_path: Path) -> None:
9326+
init_example_project(tmp_path, dialect="duckdb", template=ProjectTemplate.EMPTY)
9327+
9328+
db_path = str(tmp_path / "db.db")
9329+
db_connection = DuckDBConnectionConfig(database=db_path)
9330+
config = Config(
9331+
gateways={"duckdb": GatewayConfig(connection=db_connection)},
9332+
model_defaults=ModelDefaultsConfig(dialect="duckdb"),
9333+
)
9334+
9335+
file_a = tmp_path / "macros" / "unimportant_macro.py"
9336+
file_b = tmp_path / "macros" / "just_f.py"
9337+
9338+
file_a.write_text(
9339+
"""from sqlmesh import macro
9340+
from macros.just_f import f
9341+
9342+
a = False
9343+
9344+
@macro()
9345+
def unimportant_macro(evaluator):
9346+
print(a)
9347+
f()
9348+
return 1
9349+
"""
9350+
)
9351+
file_b.write_text(
9352+
"""a = 0
9353+
9354+
def f():
9355+
print(a)
9356+
"""
9357+
)
9358+
9359+
model_file = tmp_path / "models" / "model.sql"
9360+
model_file.write_text("MODEL (name m); SELECT @unimportant_macro() AS unimportant_macro")
9361+
9362+
with pytest.raises(SQLMeshError, match=r"duplicate definitions found"):
9363+
Context(paths=tmp_path, config=config)

0 commit comments

Comments
 (0)