feat: Allow group rules to run automatically on the primary key (#300)

borchero · web-flow · commit 2534bf7d3b2d · 2026-03-23T17:42:10.000+01:00
diff --git a/dataframely/_rule.py b/dataframely/_rule.py
@@ -6,7 +6,7 @@
 import sys
 from collections import defaultdict
 from collections.abc import Callable
-from typing import Any
+from typing import Any, Literal
 
 import polars as pl
 
@@ -99,7 +99,9 @@ class RuleFactory:
     """Factory class for rules created within schemas."""
 
     def __init__(
-        self, validation_fn: Callable[[Any], pl.Expr], group_columns: list[str] | None
+        self,
+        validation_fn: Callable[[Any], pl.Expr],
+        group_columns: list[str] | Literal["primary_key"] | None,
     ) -> None:
         self.validation_fn = validation_fn
         self.group_columns = group_columns
@@ -116,16 +118,28 @@ def from_rule(cls, rule: Rule) -> Self:
 
     def make(self, schema: Any) -> Rule:
         """Create a new rule from this factory."""
-        if self.group_columns is not None:
+        group_columns: list[str] | None
+        if self.group_columns == "primary_key":
+            from dataframely.exc import ImplementationError
+
+            group_columns = schema.primary_key()
+            if not group_columns:
+                raise ImplementationError(
+                    "Rule uses `group_by='primary_key'` but the schema has no"
+                    " primary key."
+                )
+        else:
+            group_columns = self.group_columns
+        if group_columns is not None:
             return GroupRule(
                 expr=lambda: self.validation_fn(schema),
-                group_columns=self.group_columns,
+                group_columns=group_columns,
             )
         return Rule(expr=lambda: self.validation_fn(schema))
 
 
 def rule(
-    *, group_by: list[str] | None = None
+    *, group_by: list[str] | Literal["primary_key"] | None = None
 ) -> Callable[[ValidationFunction], RuleFactory]:
     """Mark a function as a rule to evaluate during validation.
 
@@ -147,7 +161,10 @@ def rule(
         group_by: An optional list of columns to group by for rules operating on groups
             of rows. If this list is provided, the returned expression must return a
             single boolean value, i.e. some kind of aggregation function must be used
-            (e.g. `sum`, `any`, ...).
+            (e.g. `sum`, `any`, ...). Pass `"primary_key"` to group by the schema's
+            primary key columns, resolved when the schema class is created (an
+            `ImplementationError` is raised if the schema has no primary key). This is
+            useful for rules in mixin classes, where the primary key is not yet known.
 
     Note:
         You'll need to explicitly handle `null` values in your columns when defining
diff --git a/tests/schema/test_rule_implementation.py b/tests/schema/test_rule_implementation.py
@@ -29,6 +29,66 @@ def test_group_rule_group_by_error() -> None:
         )
 
 
+def test_group_rule_primary_key_single() -> None:
+    class MySchema(dy.Schema):
+        a = dy.Int64(primary_key=True)
+        b = dy.Int64()
+
+        @dy.rule(group_by="primary_key")
+        def b_positive(cls) -> pl.Expr:
+            return (pl.col("b") > 0).all()
+
+    rules = MySchema._schema_validation_rules()
+    assert isinstance(rules["b_positive"], GroupRule)
+    assert rules["b_positive"].group_columns == ["a"]
+
+
+def test_group_rule_primary_key_composite() -> None:
+    class MySchema(dy.Schema):
+        a = dy.Int64(primary_key=True)
+        b = dy.Int64(primary_key=True)
+        c = dy.Int64()
+
+        @dy.rule(group_by="primary_key")
+        def c_positive(cls) -> pl.Expr:
+            return (pl.col("c") > 0).all()
+
+    rules = MySchema._schema_validation_rules()
+    assert isinstance(rules["c_positive"], GroupRule)
+    assert sorted(rules["c_positive"].group_columns) == ["a", "b"]
+
+
+def test_group_rule_primary_key_no_pk() -> None:
+    with pytest.raises(
+        ImplementationError,
+        match=r"group_by='primary_key'.*no primary key",
+    ):
+
+        class MySchema(dy.Schema):
+            a = dy.Int64()
+
+            @dy.rule(group_by="primary_key")
+            def a_positive(cls) -> pl.Expr:
+                return (pl.col("a") > 0).all()
+
+
+def test_group_rule_primary_key_mixin() -> None:
+    class MyMixin:
+        id = dy.Int64(primary_key=True)
+        value = dy.Int64()
+
+        @dy.rule(group_by="primary_key")
+        def value_positive(cls) -> pl.Expr:
+            return (pl.col("value") > 0).all()
+
+    class MySchema(MyMixin, dy.Schema):
+        other_id = dy.Int64(primary_key=True)
+
+    rules = MySchema._schema_validation_rules()
+    assert isinstance(rules["value_positive"], GroupRule)
+    assert rules["value_positive"].group_columns == ["id", "other_id"]
+
+
 def test_rule_column_overlap_error() -> None:
     with pytest.raises(
         ImplementationError,