Skip to content

Commit 573b564

Browse files
feat: Expose a user-friendly version of FailureInfo._lf in FailureInfo.violation_details() (#275)
Co-authored-by: Oliver Borchert <oliver.borchert@quantco.com>
1 parent bc449c2 commit 573b564

5 files changed

Lines changed: 132 additions & 9 deletions

File tree

dataframely/filter_result.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,29 @@ def invalid(self) -> pl.DataFrame:
114114
"""The rows of the original data frame containing the invalid rows."""
115115
return self._df.drop(self._rule_columns)
116116

117+
def details(self) -> pl.DataFrame:
118+
"""Same as :meth:`invalid` but with additional columns indicating the results of
119+
each individual rule.
120+
121+
For each row, this includes:
122+
1. All columns of the original data frame.
123+
2. One column for each rule indicating whether the row's outcome for that rule
124+
is `valid`, `invalid`, or `unknown`.
125+
126+
If a rule column has a value of `unknown` for a given row, that means the rule
127+
could not be evaluated reliably.
128+
This may happen when calling :meth:`Collection.filter` with collection-level
129+
filters in addition to member-level rules, or when calling :meth:`Schema.filter`
130+
with `cast=True` and dtype-casting fails for a value.
131+
"""
132+
return self._df.select(
133+
pl.exclude(self._rule_columns),
134+
pl.col(*self._rule_columns).replace_strict(
135+
{True: "valid", False: "invalid", None: "unknown"},
136+
return_dtype=pl.Enum(["valid", "invalid", "unknown"]),
137+
),
138+
)
139+
117140
def counts(self) -> dict[str, int]:
118141
"""The number of validation failures for each individual rule.
119142

docs/guides/features/serialization.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,7 @@ class HouseSchema(dy.Schema):
139139
price = dy.Float64(nullable=False)
140140

141141
@dy.rule()
142-
def reasonable_bathroom_to_bedrooom_ratio(cls) -> pl.Expr:
142+
def reasonable_bathroom_to_bedroom_ratio(cls) -> pl.Expr:
143143
ratio = pl.col("num_bathrooms") / pl.col("num_bedrooms")
144144
return (ratio >= 1 / 3) & (ratio <= 3)
145145

@@ -190,9 +190,9 @@ json.loads(HouseSchema.serialize())
190190
'primary_key': False,
191191
'regex': None}},
192192
'name': 'HouseSchema',
193-
'rules': {'reasonable_bathroom_to_bedrooom_ratio': {'expr': {'__type__': 'expression',
194-
'value': 'gapCaW5hcnlFeHByg6RsZWZ0gapCaW5hcnlFeHByg6RsZWZ0gapCaW5hcnlFeHByg6RsZWZ0gaZDb2x1bW6tbnVtX2JhdGhyb29tc6JvcKpUcnVlRGl2aWRlpXJpZ2h0gaZDb2x1bW6sbnVtX2JlZHJvb21zom9wpEd0RXGlcmlnaHSBp0xpdGVyYWyBo0R5boGlRmxvYXTLP9VVVVVVVVWib3CjQW5kpXJpZ2h0gapCaW5hcnlFeHByg6RsZWZ0gapCaW5hcnlFeHByg6RsZWZ0gaZDb2x1bW6tbnVtX2JhdGhyb29tc6JvcKpUcnVlRGl2aWRlpXJpZ2h0gaZDb2x1bW6sbnVtX2JlZHJvb21zom9wpEx0RXGlcmlnaHSBp0xpdGVyYWyBo0R5boGjSW50xBAAAAAAAAAAAAAAAAAAAAAD'},
195-
'rule_type': 'Rule'}},
193+
'rules': {'reasonable_bathroom_to_bedroom_ratio': {'expr': {'__type__': 'expression',
194+
'value': 'gapCaW5hcnlFeHByg6RsZWZ0gapCaW5hcnlFeHByg6RsZWZ0gapCaW5hcnlFeHByg6RsZWZ0gaZDb2x1bW6tbnVtX2JhdGhyb29tc6JvcKpUcnVlRGl2aWRlpXJpZ2h0gaZDb2x1bW6sbnVtX2JlZHJvb21zom9wpEd0RXGlcmlnaHSBp0xpdGVyYWyBo0R5boGlRmxvYXTLP9VVVVVVVVWib3CjQW5kpXJpZ2h0gapCaW5hcnlFeHByg6RsZWZ0gapCaW5hcnlFeHByg6RsZWZ0gaZDb2x1bW6tbnVtX2JhdGhyb29tc6JvcKpUcnVlRGl2aWRlpXJpZ2h0gaZDb2x1bW6sbnVtX2JlZHJvb21zom9wpEx0RXGlcmlnaHSBp0xpdGVyYWyBo0R5boGjSW50xBAAAAAAAAAAAAAAAAAAAAAD'},
195+
'rule_type': 'Rule'}},
196196
'versions': {'dataframely': '2.0.0', 'format': '1', 'polars': '1.33.1'}}
197197
```
198198

docs/guides/quickstart.md

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -54,13 +54,13 @@ class HouseSchema(dy.Schema):
5454
price = dy.Float64(nullable=False)
5555

5656
@dy.rule()
57-
def reasonable_bathroom_to_bedrooom_ratio(cls) -> pl.Expr:
57+
def reasonable_bathroom_to_bedroom_ratio(cls) -> pl.Expr:
5858
ratio = pl.col("num_bathrooms") / pl.col("num_bedrooms")
5959
return (ratio >= 1 / 3) & (ratio <= 3)
6060
```
6161

6262
The decorator `@dy.rule()` "registers" the function as a rule using its name (i.e.
63-
`reasonable_bathroom_to_bedrooom_ratio`).
63+
`reasonable_bathroom_to_bedroom_ratio`).
6464
The returned expression provides a boolean value for each row of the data which evaluates to `True` whenever the data
6565
are valid with respect to this rule.
6666

@@ -81,7 +81,7 @@ class HouseSchema(dy.Schema):
8181
price = dy.Float64(nullable=False)
8282

8383
@dy.rule()
84-
def reasonable_bathroom_to_bedrooom_ratio(cls) -> pl.Expr:
84+
def reasonable_bathroom_to_bedroom_ratio(cls) -> pl.Expr:
8585
ratio = pl.col("num_bathrooms") / pl.col("num_bedrooms")
8686
return (ratio >= 1 / 3) & (ratio <= 3)
8787

@@ -189,7 +189,7 @@ Using the `counts` method on the :class:`~dataframely.FailureInfo` object will r
189189

190190
```python
191191
{
192-
"reasonable_bathroom_to_bedrooom_ratio": 1,
192+
"reasonable_bathroom_to_bedroom_ratio": 1,
193193
"minimum_zip_code_count": 2,
194194
"zip_code|min_length": 1,
195195
"num_bedrooms|nullability": 2,
@@ -205,6 +205,19 @@ failed_df = failure.invalid()
205205
This information tends to be very useful in tracking down issues with the data,
206206
both in productive systems and analytics environments.
207207

208+
```{note}
209+
New in `dataframely` v2.8.0: The `FailureInfo.details()` method returns the invalid rows together with one additional column per rule, indicating whether each row is `valid`, `invalid`, or `unknown` with respect to that rule.
210+
```
211+
212+
For the example above, `failure.details()` would look as follows (we omitted some columns for readability):
213+
214+
| zip_code | num_bedrooms | num_bathrooms | price | reasonable_bathroom_to_bedroom... | minimum_zip_code_count | zip_code\|min_length | num_bedrooms\|nullability | ... |
215+
| -------- | ------------ | ------------- | ------ | --------------------------------- | ---------------------- | -------------------- | ------------------------- | --- |
216+
| 1 | 1 | 1 | 50000 | valid | invalid | invalid | valid | |
217+
| 213 | null | 1 | 80000 | valid | valid | valid | invalid | |
218+
| 123 | null | 0 | 60000 | valid | invalid | valid | invalid | |
219+
| 213 | 2 | 8 | 160000 | invalid | valid | valid | valid | |
220+
208221
## Type casting
209222

210223
In rare cases, you might already be _absolutely certain_ that a data frame is valid with
@@ -229,7 +242,8 @@ df_concat = HouseSchema.cast(pl.concat([df1, df2]))
229242
Lastly, `dataframely` schemas can be used to integrate with external tools:
230243

231244
- `HouseSchema.create_empty()` creates an empty `dy.DataFrame[HouseSchema]` that can be used for testing
232-
- `HouseSchema.to_sqlalchemy_columns()` provides a list of [sqlalchemy](https://www.sqlalchemy.org) columns that can be used to
245+
- `HouseSchema.to_sqlalchemy_columns()` provides a list of [sqlalchemy](https://www.sqlalchemy.org) columns that can be
246+
used to
233247
create SQL tables using types and constraints in line with the schema
234248
- `HouseSchema.to_pyarrow_schema()` provides a [pyarrow](https://arrow.apache.org/docs/python/index.html) schema with
235249
appropriate column dtypes and nullability information

tests/collection/test_filter_validate.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -304,3 +304,57 @@ def test_maintain_order() -> None:
304304
out = MyShufflingCollection.validate(out.to_dict())
305305
assert out.first.select("a").collect().to_series().is_sorted()
306306
assert out.second.select("a").collect().to_series().is_sorted()
307+
308+
309+
def test_unknown_rule_outcomes(
310+
data_without_filter_with_rule_violation: tuple[pl.DataFrame, pl.DataFrame],
311+
) -> None:
312+
_, fails = MyCollection.filter(
313+
{
314+
"first": data_without_filter_with_rule_violation[0],
315+
"second": data_without_filter_with_rule_violation[1],
316+
}
317+
)
318+
assert fails["first"].details().to_dicts() == [
319+
{
320+
"a": 1,
321+
"b": 1,
322+
"a|nullability": "valid",
323+
"b|nullability": "valid",
324+
"equal_primary_key": "unknown",
325+
"first_b_greater_second_b": "unknown",
326+
"primary_key": "invalid",
327+
},
328+
{
329+
"a": 1,
330+
"b": 3,
331+
"a|nullability": "valid",
332+
"b|nullability": "valid",
333+
"equal_primary_key": "unknown",
334+
"first_b_greater_second_b": "unknown",
335+
"primary_key": "invalid",
336+
},
337+
]
338+
339+
assert fails["second"].details().to_dicts() == [
340+
{
341+
"a": 1,
342+
"b": 0,
343+
"primary_key": "valid",
344+
"a|nullability": "valid",
345+
"b|nullability": "valid",
346+
"b|min": "invalid",
347+
"equal_primary_key": "unknown",
348+
"first_b_greater_second_b": "unknown",
349+
},
350+
{
351+
"a": 3,
352+
"b": 2,
353+
"primary_key": "unknown",
354+
"a|nullability": "unknown",
355+
"b|nullability": "unknown",
356+
"b|min": "unknown",
357+
"equal_primary_key": "invalid",
358+
"first_b_greater_second_b": "valid",
359+
},
360+
]

tests/schema/test_filter.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -243,3 +243,35 @@ def test_filter_maintain_order(eager: bool) -> None:
243243
)
244244
out, _ = _filter_and_collect(schema, df, cast=True, eager=eager)
245245
assert out.get_column("a").is_sorted()
246+
247+
248+
@pytest.mark.parametrize("eager", [True, False])
249+
def test_filter_details(eager: bool) -> None:
250+
df = pl.DataFrame(
251+
{
252+
"a": [2, 2],
253+
"b": ["bar", "foobar"],
254+
}
255+
)
256+
_, fails = _filter_and_collect(MySchema, df, cast=True, eager=eager)
257+
258+
assert fails.details().to_dicts() == [
259+
{
260+
"a": 2,
261+
"b": "bar",
262+
"a|dtype": "valid",
263+
"a|nullability": "valid",
264+
"b|dtype": "valid",
265+
"b|max_length": "valid",
266+
"primary_key": "invalid",
267+
},
268+
{
269+
"a": 2,
270+
"b": "foobar",
271+
"a|dtype": "valid",
272+
"a|nullability": "valid",
273+
"b|dtype": "valid",
274+
"b|max_length": "invalid",
275+
"primary_key": "invalid",
276+
},
277+
]

0 commit comments

Comments
 (0)