Skip to content

Commit 4e92fea

Browse files
Claude and borchero authored
feat: Introduce pydantic conversion for schemas (#324)
Co-authored-by: anthropic-code-agent[bot] <242468646+Claude@users.noreply.github.com> Co-authored-by: borchero <22455425+borchero@users.noreply.github.com> Co-authored-by: Oliver Borchert <oliver.borchert@quantco.com>
1 parent 49121b8 commit 4e92fea

File tree

19 files changed

+702
-8
lines changed

19 files changed

+702
-8
lines changed

dataframely/columns/_base.py

Lines changed: 47 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,15 @@
55

66
import inspect
77
import sys
8+
import warnings
89
from abc import ABC, abstractmethod
910
from collections import Counter
1011
from collections.abc import Callable, Mapping, Sequence
11-
from typing import Any, TypeAlias, cast
12+
from typing import Annotated, Any, TypeAlias, cast
1213

1314
import polars as pl
1415

15-
from dataframely._compat import pa, sa, sa_TypeEngine
16+
from dataframely._compat import pa, pydantic, sa, sa_TypeEngine
1617
from dataframely._polars import PolarsDataType
1718
from dataframely.random import Generator
1819

@@ -222,6 +223,50 @@ def pyarrow_field(self, name: str) -> pa.Field:
222223
def pyarrow_dtype(self) -> pa.DataType:
223224
"""The :mod:`pyarrow` dtype equivalent of this column data type."""
224225

226+
# ----------------------------------- PYDANTIC ----------------------------------- #
227+
228+
def pydantic_field(self) -> Any:
    """Obtain a pydantic field type for this column definition.

    Returns:
        A pydantic-compatible type annotation that includes structured constraints
        (such as `min`, `max`, ...).

    Warning:
        Custom checks are not translated to pydantic validators.
    """
    if self.check is not None:
        # ``stacklevel=2`` attributes the warning to the caller of
        # ``pydantic_field`` rather than to this library-internal line.
        warnings.warn(
            f"Custom checks for column '{self.name or self.__class__.__name__}' "
            "are not translated to pydantic constraints.",
            stacklevel=2,
        )

    python_type = self._python_type
    if self.nullable:
        # Nullable columns admit ``None`` in the pydantic model as well.
        python_type = python_type | None

    field_kwargs = self._pydantic_field_kwargs()
    if field_kwargs:
        # Attach structured constraints via ``Annotated`` so the plain type
        # annotation is preserved alongside the pydantic ``Field`` metadata.
        return Annotated[python_type, pydantic.Field(**field_kwargs)]
    return python_type
252+
253+
@property
@abstractmethod
def _python_type(self) -> Any:
    """The native Python type corresponding to this column definition.

    Subclasses return the plain Python type (e.g. ``int``, ``str``,
    ``datetime.date``) used as the base annotation by :meth:`pydantic_field`.
    """
257+
258+
def _pydantic_field_kwargs(self) -> dict[str, Any]:
    """Return kwargs for pydantic.Field initialization.

    This hook is designed for cooperative extension: subclasses and mixins
    override it, call ``super()._pydantic_field_kwargs()``, and add their
    specific constraints to the returned dictionary.

    Returns:
        A dictionary of kwargs to pass to pydantic.Field.
    """
    # The base column imposes no structured constraints.
    return dict()
269+
225270
# ------------------------------------ HELPER ------------------------------------ #
226271

227272
@property

dataframely/columns/_mixins.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,18 @@ def validation_rules(self, expr: pl.Expr) -> dict[str, pl.Expr]:
8080
result["max_exclusive"] = expr < self.max_exclusive # type: ignore
8181
return result
8282

83+
def _pydantic_field_kwargs(self) -> dict[str, Any]:
    """Translate min/max bounds into pydantic comparison constraints."""
    kwargs = super()._pydantic_field_kwargs()
    # Map each configured bound onto the corresponding pydantic keyword.
    bound_map = (
        ("ge", self.min),
        ("gt", self.min_exclusive),
        ("le", self.max),
        ("lt", self.max_exclusive),
    )
    for keyword, bound in bound_map:
        if bound is not None:
            kwargs[keyword] = bound
    return kwargs
8395

8496
# ------------------------------------ IS IN MIXIN ----------------------------------- #
8597

dataframely/columns/any.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33

44
from __future__ import annotations
55

6+
from typing import Any as AnyType
7+
68
import polars as pl
79

810
from dataframely._compat import pa, sa, sa_mssql, sa_TypeEngine
@@ -77,5 +79,9 @@ def pyarrow_field(self, name: str) -> pa.Field:
7779
def pyarrow_dtype(self) -> pa.DataType:
7880
return pa.null()
7981

82+
@property
def _python_type(self) -> AnyType:
    """Any value is admissible for this column, hence ``typing.Any``."""
    return AnyType
8086
def _sample_unchecked(self, generator: Generator, n: int) -> pl.Series:
8187
return pl.repeat(None, n, dtype=pl.Null, eager=True)

dataframely/columns/array.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
import math
77
import sys
8+
import warnings
89
from collections.abc import Sequence
910
from typing import Any, Literal, cast
1011

@@ -121,6 +122,23 @@ def _pyarrow_field_of_shape(self, shape: Sequence[int]) -> pa.Field:
121122
def pyarrow_dtype(self) -> pa.DataType:
122123
return self._pyarrow_field_of_shape(self.shape).type
123124

125+
@property
def _python_type(self) -> Any:
    """A list whose element annotation is the inner column's pydantic field."""
    element_type = self.inner.pydantic_field()
    return list[element_type]  # type: ignore
129+
130+
def _pydantic_field_kwargs(self) -> dict[str, Any]:
    """Constrain the validated list length to the total number of array elements.

    Multi-dimensional shapes cannot be expressed as nested pydantic
    constraints, so arrays are validated as a flat list of exactly
    ``math.prod(self.shape)`` elements.
    """
    if len(self.shape) != 1:
        # ``stacklevel=2`` attributes the warning to the caller rather than
        # to this library-internal line.
        warnings.warn(
            "Multi-dimensional arrays are flattened for pydantic validation.",
            stacklevel=2,
        )

    # The flat element count is both the lower and upper length bound.
    n_elements = math.prod(self.shape)
    return {
        **super()._pydantic_field_kwargs(),
        "min_length": n_elements,
        "max_length": n_elements,
    }
141+
124142
def _sample_unchecked(self, generator: Generator, n: int) -> pl.Series:
125143
# Sample the inner elements in a flat series
126144
n_elements = n * math.prod(self.shape)

dataframely/columns/binary.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33

44
from __future__ import annotations
55

6+
from typing import Any
7+
68
import polars as pl
79

810
from dataframely._compat import pa, sa, sa_TypeEngine
@@ -31,6 +33,10 @@ def sqlalchemy_dtype(self, dialect: sa.Dialect) -> sa_TypeEngine:
3133
def pyarrow_dtype(self) -> pa.DataType:
3234
return pa.large_binary()
3335

36+
@property
37+
def _python_type(self) -> Any:
38+
return bytes
39+
3440
def _sample_unchecked(self, generator: Generator, n: int) -> pl.Series:
3541
return generator.sample_binary(
3642
n,

dataframely/columns/bool.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33

44
from __future__ import annotations
55

6+
from typing import Any
7+
68
import polars as pl
79

810
from dataframely._compat import pa, sa, sa_TypeEngine
@@ -27,5 +29,9 @@ def sqlalchemy_dtype(self, dialect: sa.Dialect) -> sa_TypeEngine:
2729
def pyarrow_dtype(self) -> pa.DataType:
2830
return pa.bool_()
2931

32+
@property
33+
def _python_type(self) -> Any:
34+
return bool
35+
3036
def _sample_unchecked(self, generator: Generator, n: int) -> pl.Series:
3137
return generator.sample_bool(n, null_probability=self._null_probability)

dataframely/columns/categorical.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,10 @@ def sqlalchemy_dtype(self, dialect: sa.Dialect) -> sa_TypeEngine:
7171
def pyarrow_dtype(self) -> pa.DataType:
7272
return pa.dictionary(pa.uint32(), pa.large_string())
7373

74+
@property
75+
def _python_type(self) -> Any:
76+
return str
77+
7478
def _sample_unchecked(self, generator: Generator, n: int) -> pl.Series:
7579
# We simply sample low-cardinality strings here
7680
return generator.sample_string(

dataframely/columns/datetime.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from __future__ import annotations
55

66
import datetime as dt
7+
import warnings
78
from typing import Any, cast
89

910
import polars as pl
@@ -132,6 +133,16 @@ def sqlalchemy_dtype(self, dialect: sa.Dialect) -> sa_TypeEngine:
132133
def pyarrow_dtype(self) -> pa.DataType:
133134
return pa.date32()
134135

136+
@property
def _python_type(self) -> Any:
    """Date columns map to Python ``datetime.date``."""
    return dt.date

def _pydantic_field_kwargs(self) -> dict[str, Any]:
    """Return pydantic.Field kwargs; a configured resolution is only warned about."""
    if self.resolution is not None:
        # ``stacklevel=2`` attributes the warning to the caller.
        warnings.warn(
            "Date resolution is not translated to a pydantic constraint.",
            stacklevel=2,
        )

    return super()._pydantic_field_kwargs()
145+
135146
def _sample_unchecked(self, generator: Generator, n: int) -> pl.Series:
136147
return generator.sample_date(
137148
n,
@@ -261,6 +272,16 @@ def sqlalchemy_dtype(self, dialect: sa.Dialect) -> sa_TypeEngine:
261272
def pyarrow_dtype(self) -> pa.DataType:
262273
return pa.time64("ns")
263274

275+
@property
def _python_type(self) -> Any:
    """Time columns map to Python ``datetime.time``."""
    return dt.time

def _pydantic_field_kwargs(self) -> dict[str, Any]:
    """Return pydantic.Field kwargs; a configured resolution is only warned about."""
    if self.resolution is not None:
        # ``stacklevel=2`` attributes the warning to the caller.
        warnings.warn(
            "Time resolution is not translated to a pydantic constraint.",
            stacklevel=2,
        )

    return super()._pydantic_field_kwargs()
284+
264285
def _sample_unchecked(self, generator: Generator, n: int) -> pl.Series:
265286
return generator.sample_time(
266287
n,
@@ -394,6 +415,22 @@ def pyarrow_dtype(self) -> pa.DataType:
394415
)
395416
return pa.timestamp(self.time_unit, time_zone)
396417

418+
@property
def _python_type(self) -> Any:
    """Datetime columns map to Python ``datetime.datetime``."""
    return dt.datetime

def _pydantic_field_kwargs(self) -> dict[str, Any]:
    """Return pydantic.Field kwargs; resolution and time zone are only warned about."""
    if self.resolution is not None:
        # ``stacklevel=2`` attributes the warning to the caller.
        warnings.warn(
            "Datetime resolution is not translated to a pydantic constraint.",
            stacklevel=2,
        )
    if self.time_zone is not None:
        warnings.warn(
            "Datetime time zone is not translated to a pydantic constraint.",
            stacklevel=2,
        )

    return super()._pydantic_field_kwargs()
433+
397434
def _sample_unchecked(self, generator: Generator, n: int) -> pl.Series:
398435
return generator.sample_datetime(
399436
n,
@@ -531,6 +568,18 @@ def sqlalchemy_dtype(self, dialect: sa.Dialect) -> sa_TypeEngine:
531568
def pyarrow_dtype(self) -> pa.DataType:
532569
return pa.duration(self.time_unit)
533570

571+
@property
def _python_type(self) -> Any:
    """Duration columns map to Python ``datetime.timedelta``."""
    return dt.timedelta

def _pydantic_field_kwargs(self) -> dict[str, Any]:
    """Return pydantic.Field kwargs; a configured resolution is only warned about."""
    if self.resolution is not None:
        # ``stacklevel=2`` attributes the warning to the caller.
        warnings.warn(
            "Duration resolution is not translated to a pydantic constraint.",
            stacklevel=2,
        )

    return super()._pydantic_field_kwargs()
582+
534583
def _sample_unchecked(self, generator: Generator, n: int) -> pl.Series:
535584
# NOTE: If no duration is specified, we default to 100 years
536585
return generator.sample_duration(

dataframely/columns/decimal.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,16 @@ def pyarrow_dtype(self) -> pa.DataType:
128128
# We do not use decimal256 since its values cannot be represented in SQL Server.
129129
return pa.decimal128(self.precision or 38, self.scale)
130130

131+
@property
def _python_type(self) -> Any:
    """Decimal columns map to Python ``decimal.Decimal``."""
    return decimal.Decimal

def _pydantic_field_kwargs(self) -> dict[str, Any]:
    """Translate scale and precision into pydantic decimal constraints."""
    kwargs = {
        **super()._pydantic_field_kwargs(),
        "decimal_places": self.scale,
    }
    # ``max_digits`` mirrors the column's precision. Only add it when the
    # user set a precision explicitly — the fallback of 38 used for SQL and
    # pyarrow is a storage default, not a user-declared constraint.
    if self.precision is not None:
        kwargs["max_digits"] = self.precision
    return kwargs
140+
131141
def _sample_unchecked(self, generator: Generator, n: int) -> pl.Series:
132142
# NOTE: Default precision to 38 for sampling, just like for SQL and Pyarrow
133143
precision = self.precision or 38

dataframely/columns/enum.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import enum
77
from collections.abc import Iterable
88
from inspect import isclass
9-
from typing import Any
9+
from typing import Any, Literal
1010

1111
import polars as pl
1212

@@ -95,6 +95,10 @@ def pyarrow_dtype(self) -> pa.DataType:
9595
dtype = pa.uint32()
9696
return pa.dictionary(dtype, pa.large_string())
9797

98+
@property
def _python_type(self) -> Any:
    """A ``Literal`` type enumerating exactly the enum's categories."""
    allowed = tuple(self.categories)
    return Literal[allowed]
101+
98102
def _sample_unchecked(self, generator: Generator, n: int) -> pl.Series:
99103
return generator.sample_choice(
100104
n,

0 commit comments

Comments
 (0)