feat: Expose a user-friendly version of FailureInfo._lf in FailureInfo.details() (#275)

AndreasAlbertQC · borchero · web-flow · commit 573b56454b49 · 2026-02-16T18:07:09.000Z
Co-authored-by: Oliver Borchert <oliver.borchert@quantco.com>
diff --git a/dataframely/filter_result.py b/dataframely/filter_result.py
@@ -114,6 +114,29 @@ def invalid(self) -> pl.DataFrame:
         """The rows of the original data frame containing the invalid rows."""
         return self._df.drop(self._rule_columns)
 
+    def details(self) -> pl.DataFrame:
+        """Same as :meth:`invalid` but with additional columns indicating the results of
+        each individual rule.
+
+        For each row, this includes:
+            1. All columns of the original data frame.
+            2. One column for each rule indicating whether the value of the column
+             is `valid`, `invalid`, or `unknown`.
+
+        If a rule column has a value of `unknown` for a given row, that means the rule
+        could not be evaluated reliably.
+        This may happen when calling :meth:`Collection.filter` with collection-level
+        filters in addition to member-level rules, or when calling :meth:`Schema.filter`
+        with `cast=True` and dtype-casting fails for a value.
+        """
+        return self._df.select(
+            pl.exclude(self._rule_columns),
+            pl.col(*self._rule_columns).replace_strict(
+                {True: "valid", False: "invalid", None: "unknown"},
+                return_dtype=pl.Enum(["valid", "invalid", "unknown"]),
+            ),
+        )
+
     def counts(self) -> dict[str, int]:
         """The number of validation failures for each individual rule.
 
diff --git a/docs/guides/features/serialization.md b/docs/guides/features/serialization.md
@@ -139,7 +139,7 @@ class HouseSchema(dy.Schema):
     price = dy.Float64(nullable=False)
 
     @dy.rule()
-    def reasonable_bathroom_to_bedrooom_ratio(cls) -> pl.Expr:
+    def reasonable_bathroom_to_bedroom_ratio(cls) -> pl.Expr:
         ratio = pl.col("num_bathrooms") / pl.col("num_bedrooms")
         return (ratio >= 1 / 3) & (ratio <= 3)
 
@@ -190,9 +190,9 @@ json.loads(HouseSchema.serialize())
                           'primary_key': False,
                           'regex': None}},
  'name': 'HouseSchema',
- 'rules': {'reasonable_bathroom_to_bedrooom_ratio': {'expr': {'__type__': 'expression',
-                                                              'value': 'gapCaW5hcnlFeHByg6RsZWZ0gapCaW5hcnlFeHByg6RsZWZ0gapCaW5hcnlFeHByg6RsZWZ0gaZDb2x1bW6tbnVtX2JhdGhyb29tc6JvcKpUcnVlRGl2aWRlpXJpZ2h0gaZDb2x1bW6sbnVtX2JlZHJvb21zom9wpEd0RXGlcmlnaHSBp0xpdGVyYWyBo0R5boGlRmxvYXTLP9VVVVVVVVWib3CjQW5kpXJpZ2h0gapCaW5hcnlFeHByg6RsZWZ0gapCaW5hcnlFeHByg6RsZWZ0gaZDb2x1bW6tbnVtX2JhdGhyb29tc6JvcKpUcnVlRGl2aWRlpXJpZ2h0gaZDb2x1bW6sbnVtX2JlZHJvb21zom9wpEx0RXGlcmlnaHSBp0xpdGVyYWyBo0R5boGjSW50xBAAAAAAAAAAAAAAAAAAAAAD'},
-                                                     'rule_type': 'Rule'}},
+ 'rules': {'reasonable_bathroom_to_bedroom_ratio': {'expr': {'__type__': 'expression',
+                                                             'value': 'gapCaW5hcnlFeHByg6RsZWZ0gapCaW5hcnlFeHByg6RsZWZ0gapCaW5hcnlFeHByg6RsZWZ0gaZDb2x1bW6tbnVtX2JhdGhyb29tc6JvcKpUcnVlRGl2aWRlpXJpZ2h0gaZDb2x1bW6sbnVtX2JlZHJvb21zom9wpEd0RXGlcmlnaHSBp0xpdGVyYWyBo0R5boGlRmxvYXTLP9VVVVVVVVWib3CjQW5kpXJpZ2h0gapCaW5hcnlFeHByg6RsZWZ0gapCaW5hcnlFeHByg6RsZWZ0gaZDb2x1bW6tbnVtX2JhdGhyb29tc6JvcKpUcnVlRGl2aWRlpXJpZ2h0gaZDb2x1bW6sbnVtX2JlZHJvb21zom9wpEx0RXGlcmlnaHSBp0xpdGVyYWyBo0R5boGjSW50xBAAAAAAAAAAAAAAAAAAAAAD'},
+                                                    'rule_type': 'Rule'}},
  'versions': {'dataframely': '2.0.0', 'format': '1', 'polars': '1.33.1'}}
 ```
 
diff --git a/docs/guides/quickstart.md b/docs/guides/quickstart.md
@@ -54,13 +54,13 @@ class HouseSchema(dy.Schema):
     price = dy.Float64(nullable=False)
 
     @dy.rule()
-    def reasonable_bathroom_to_bedrooom_ratio(cls) -> pl.Expr:
+    def reasonable_bathroom_to_bedroom_ratio(cls) -> pl.Expr:
         ratio = pl.col("num_bathrooms") / pl.col("num_bedrooms")
         return (ratio >= 1 / 3) & (ratio <= 3)
 ```
 
 The decorator `@dy.rule()` "registers" the function as a rule using its name (i.e.
-`reasonable_bathroom_to_bedrooom_ratio`).
+`reasonable_bathroom_to_bedroom_ratio`).
 The returned expression provides a boolean value for each row of the data which evaluates to `True` whenever the data
 are valid with respect to this rule.
 
@@ -81,7 +81,7 @@ class HouseSchema(dy.Schema):
     price = dy.Float64(nullable=False)
 
     @dy.rule()
-    def reasonable_bathroom_to_bedrooom_ratio(cls) -> pl.Expr:
+    def reasonable_bathroom_to_bedroom_ratio(cls) -> pl.Expr:
         ratio = pl.col("num_bathrooms") / pl.col("num_bedrooms")
         return (ratio >= 1 / 3) & (ratio <= 3)
 
@@ -189,7 +189,7 @@ Using the `counts` method on the :class:`~dataframely.FailureInfo` object will r
 
 ```python
 {
-    "reasonable_bathroom_to_bedrooom_ratio": 1,
+    "reasonable_bathroom_to_bedroom_ratio": 1,
     "minimum_zip_code_count": 2,
     "zip_code|min_length": 1,
     "num_bedrooms|nullability": 2,
@@ -205,6 +205,19 @@ failed_df = failure.invalid()
 This information tends to be very useful in tracking down issues with the data,
 both in productive systems and analytics environments.
 
+```{note}
+New in `dataframely` v2.8.0: The `FailureInfo.details()` method returns the invalid rows together with one additional column per rule, indicating whether that rule is `valid`, `invalid`, or `unknown` for each row.
+```
+
+For the example above, `failure.details()` would look as follows (we omitted some columns for readability):
+
+| zip_code | num_bedrooms | num_bathrooms | price  | reasonable_bathroom_to_bedroom... | minimum_zip_code_count | zip_code\|min_length | num_bedrooms\|nullability | ... |
+| -------- | ------------ | ------------- | ------ | --------------------------------- | ---------------------- | -------------------- | ------------------------- | --- |
+| 1        | 1            | 1             | 50000  | valid                             | invalid                | invalid              | valid                     |     |
+| 213      | null         | 1             | 80000  | valid                             | valid                  | valid                | invalid                   |     |
+| 123      | null         | 0             | 60000  | valid                             | invalid                | valid                | invalid                   |     |
+| 213      | 2            | 8             | 160000 | invalid                           | valid                  | valid                | valid                     |     |
+
 ## Type casting
 
 In rare cases, you might already be _absolutely certain_ that a data frame is valid with
@@ -229,7 +242,8 @@ df_concat = HouseSchema.cast(pl.concat([df1, df2]))
 Lastly, `dataframely` schemas can be used to integrate with external tools:
 
 - `HouseSchema.create_empty()` creates an empty `dy.DataFrame[HouseSchema]` that can be used for testing
-- `HouseSchema.to_sqlalchemy_columns()` provides a list of [sqlalchemy](https://www.sqlalchemy.org) columns that can be used to
+- `HouseSchema.to_sqlalchemy_columns()` provides a list of [sqlalchemy](https://www.sqlalchemy.org) columns that can be
+  used to
   create SQL tables using types and constraints in line with the schema
 - `HouseSchema.to_pyarrow_schema()` provides a [pyarrow](https://arrow.apache.org/docs/python/index.html) schema with
   appropriate column dtypes and nullability information
diff --git a/tests/collection/test_filter_validate.py b/tests/collection/test_filter_validate.py
@@ -304,3 +304,57 @@ def test_maintain_order() -> None:
     out = MyShufflingCollection.validate(out.to_dict())
     assert out.first.select("a").collect().to_series().is_sorted()
     assert out.second.select("a").collect().to_series().is_sorted()
+
+
+def test_unknown_rule_outcomes(
+    data_without_filter_with_rule_violation: tuple[pl.DataFrame, pl.DataFrame],
+) -> None:
+    _, fails = MyCollection.filter(
+        {
+            "first": data_without_filter_with_rule_violation[0],
+            "second": data_without_filter_with_rule_violation[1],
+        }
+    )
+    assert fails["first"].details().to_dicts() == [
+        {
+            "a": 1,
+            "b": 1,
+            "a|nullability": "valid",
+            "b|nullability": "valid",
+            "equal_primary_key": "unknown",
+            "first_b_greater_second_b": "unknown",
+            "primary_key": "invalid",
+        },
+        {
+            "a": 1,
+            "b": 3,
+            "a|nullability": "valid",
+            "b|nullability": "valid",
+            "equal_primary_key": "unknown",
+            "first_b_greater_second_b": "unknown",
+            "primary_key": "invalid",
+        },
+    ]
+
+    assert fails["second"].details().to_dicts() == [
+        {
+            "a": 1,
+            "b": 0,
+            "primary_key": "valid",
+            "a|nullability": "valid",
+            "b|nullability": "valid",
+            "b|min": "invalid",
+            "equal_primary_key": "unknown",
+            "first_b_greater_second_b": "unknown",
+        },
+        {
+            "a": 3,
+            "b": 2,
+            "primary_key": "unknown",
+            "a|nullability": "unknown",
+            "b|nullability": "unknown",
+            "b|min": "unknown",
+            "equal_primary_key": "invalid",
+            "first_b_greater_second_b": "valid",
+        },
+    ]
diff --git a/tests/schema/test_filter.py b/tests/schema/test_filter.py
@@ -243,3 +243,35 @@ def test_filter_maintain_order(eager: bool) -> None:
     )
     out, _ = _filter_and_collect(schema, df, cast=True, eager=eager)
     assert out.get_column("a").is_sorted()
+
+
+@pytest.mark.parametrize("eager", [True, False])
+def test_filter_details(eager: bool) -> None:
+    df = pl.DataFrame(
+        {
+            "a": [2, 2],
+            "b": ["bar", "foobar"],
+        }
+    )
+    _, fails = _filter_and_collect(MySchema, df, cast=True, eager=eager)
+
+    assert fails.details().to_dicts() == [
+        {
+            "a": 2,
+            "b": "bar",
+            "a|dtype": "valid",
+            "a|nullability": "valid",
+            "b|dtype": "valid",
+            "b|max_length": "valid",
+            "primary_key": "invalid",
+        },
+        {
+            "a": 2,
+            "b": "foobar",
+            "a|dtype": "valid",
+            "a|nullability": "valid",
+            "b|dtype": "valid",
+            "b|max_length": "invalid",
+            "primary_key": "invalid",
+        },
+    ]