Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit 9f27117

Browse files
author
Sergey Vasilyev
committed
Group raw column info from rows to structures for schema parsing
1 parent 60ac169 commit 9f27117

14 files changed

Lines changed: 182 additions & 138 deletions

File tree

data_diff/__main__.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@
1212
from rich.logging import RichHandler
1313
import click
1414

15-
from data_diff import Database
16-
from data_diff.schema import create_schema
15+
from data_diff import Database, DbPath
16+
from data_diff.schema import RawColumnInfo, create_schema
1717
from data_diff.queries.api import current_timestamp
1818

1919
from data_diff.dbt import dbt_diff
@@ -72,7 +72,7 @@ def _remove_passwords_in_dict(d: dict) -> None:
7272
d[k] = remove_password_from_url(v)
7373

7474

75-
def _get_schema(pair):
75+
def _get_schema(pair: Tuple[Database, DbPath]) -> Dict[str, RawColumnInfo]:
7676
db, table_path = pair
7777
return db.query_table_schema(table_path)
7878

data_diff/databases/base.py

Lines changed: 34 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
from data_diff.abcs.compiler import AbstractCompiler, Compilable
2121
from data_diff.queries.extras import ApplyFuncAndNormalizeAsString, Checksum, NormalizeAsString
22+
from data_diff.schema import RawColumnInfo
2223
from data_diff.utils import ArithString, is_uuid, join_iter, safezip
2324
from data_diff.queries.api import Expr, table, Select, SKIP, Explain, Code, this
2425
from data_diff.queries.ast_classes import (
@@ -707,27 +708,18 @@ def type_repr(self, t) -> str:
707708
datetime: "TIMESTAMP",
708709
}[t]
709710

710-
def _parse_type_repr(self, type_repr: str) -> Optional[Type[ColType]]:
711-
return self.TYPE_CLASSES.get(type_repr)
712-
713-
def parse_type(
714-
self,
715-
table_path: DbPath,
716-
col_name: str,
717-
type_repr: str,
718-
datetime_precision: int = None,
719-
numeric_precision: int = None,
720-
numeric_scale: int = None,
721-
) -> ColType:
711+
def parse_type(self, table_path: DbPath, info: RawColumnInfo) -> ColType:
722712
"Parse type info as returned by the database"
723713

724-
cls = self._parse_type_repr(type_repr)
714+
cls = self.TYPE_CLASSES.get(info.type_repr)
725715
if cls is None:
726-
return UnknownColType(type_repr)
716+
return UnknownColType(info.type_repr)
727717

728718
if issubclass(cls, TemporalType):
729719
return cls(
730-
precision=datetime_precision if datetime_precision is not None else DEFAULT_DATETIME_PRECISION,
720+
precision=info.datetime_precision
721+
if info.datetime_precision is not None
722+
else DEFAULT_DATETIME_PRECISION,
731723
rounds=self.ROUNDS_ON_PREC_LOSS,
732724
)
733725

@@ -738,22 +730,22 @@ def parse_type(
738730
return cls()
739731

740732
elif issubclass(cls, Decimal):
741-
if numeric_scale is None:
742-
numeric_scale = 0 # Needed for Oracle.
743-
return cls(precision=numeric_scale)
733+
if info.numeric_scale is None:
734+
return cls(precision=0) # Needed for Oracle.
735+
return cls(precision=info.numeric_scale)
744736

745737
elif issubclass(cls, Float):
746738
# assert numeric_scale is None
747739
return cls(
748740
precision=self._convert_db_precision_to_digits(
749-
numeric_precision if numeric_precision is not None else DEFAULT_NUMERIC_PRECISION
741+
info.numeric_precision if info.numeric_precision is not None else DEFAULT_NUMERIC_PRECISION
750742
)
751743
)
752744

753745
elif issubclass(cls, (JSON, Array, Struct, Text, Native_UUID)):
754746
return cls()
755747

756-
raise TypeError(f"Parsing {type_repr} returned an unknown type '{cls}'.")
748+
raise TypeError(f"Parsing {info.type_repr} returned an unknown type {cls!r}.")
757749

758750
def _convert_db_precision_to_digits(self, p: int) -> int:
759751
"""Convert from binary precision, used by floats, to decimal precision."""
@@ -1018,7 +1010,7 @@ def select_table_schema(self, path: DbPath) -> str:
10181010
f"WHERE table_name = '{name}' AND table_schema = '{schema}'"
10191011
)
10201012

1021-
def query_table_schema(self, path: DbPath) -> Dict[str, tuple]:
1013+
def query_table_schema(self, path: DbPath) -> Dict[str, RawColumnInfo]:
10221014
"""Query the table for its schema for table in 'path', and return {column: RawColumnInfo}
10231015
where RawColumnInfo carries (column_name, type_repr, datetime_precision?, numeric_precision?, numeric_scale?, collation_name?)
10241016
@@ -1029,7 +1021,17 @@ def query_table_schema(self, path: DbPath) -> Dict[str, tuple]:
10291021
if not rows:
10301022
raise RuntimeError(f"{self.name}: Table '{'.'.join(path)}' does not exist, or has no columns")
10311023

1032-
d = {r[0]: r for r in rows}
1024+
d = {
1025+
r[0]: RawColumnInfo(
1026+
column_name=r[0],
1027+
type_repr=r[1],
1028+
datetime_precision=r[2],
1029+
numeric_precision=r[3],
1030+
numeric_scale=r[4],
1031+
collation_name=r[5] if len(r) > 5 else None,
1032+
)
1033+
for r in rows
1034+
}
10331035
assert len(d) == len(rows)
10341036
return d
10351037

@@ -1051,7 +1053,11 @@ def query_table_unique_columns(self, path: DbPath) -> List[str]:
10511053
return list(res)
10521054

10531055
def _process_table_schema(
1054-
self, path: DbPath, raw_schema: Dict[str, tuple], filter_columns: Sequence[str] = None, where: str = None
1056+
self,
1057+
path: DbPath,
1058+
raw_schema: Dict[str, RawColumnInfo],
1059+
filter_columns: Sequence[str] = None,
1060+
where: str = None,
10551061
):
10561062
"""Process the result of query_table_schema().
10571063
@@ -1067,7 +1073,7 @@ def _process_table_schema(
10671073
accept = {i.lower() for i in filter_columns}
10681074
filtered_schema = {name: row for name, row in raw_schema.items() if name.lower() in accept}
10691075

1070-
col_dict = {row[0]: self.dialect.parse_type(path, *row) for _name, row in filtered_schema.items()}
1076+
col_dict = {info.column_name: self.dialect.parse_type(path, info) for info in filtered_schema.values()}
10711077

10721078
self._refine_coltypes(path, col_dict, where)
10731079

@@ -1076,15 +1082,15 @@ def _process_table_schema(
10761082

10771083
def _refine_coltypes(
10781084
self, table_path: DbPath, col_dict: Dict[str, ColType], where: Optional[str] = None, sample_size=64
1079-
):
1085+
) -> Dict[str, ColType]:
10801086
"""Refine the types in the column dict, by querying the database for a sample of their values
10811087
10821088
'where' restricts the rows to be sampled.
10831089
"""
10841090

10851091
text_columns = [k for k, v in col_dict.items() if isinstance(v, Text)]
10861092
if not text_columns:
1087-
return
1093+
return col_dict
10881094

10891095
fields = [Code(self.dialect.normalize_uuid(self.dialect.quote(c), String_UUID())) for c in text_columns]
10901096

@@ -1118,6 +1124,8 @@ def _refine_coltypes(
11181124
assert col_name in col_dict
11191125
col_dict[col_name] = String_VaryingAlphanum()
11201126

1127+
return col_dict
1128+
11211129
def _normalize_table_path(self, path: DbPath) -> DbPath:
11221130
if len(path) == 1:
11231131
return self.default_schema, path[0]

data_diff/databases/bigquery.py

Lines changed: 7 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
MD5_HEXDIGITS,
3434
)
3535
from data_diff.databases.base import TIMESTAMP_PRECISION_POS, ThreadLocalInterpreter
36+
from data_diff.schema import RawColumnInfo
3637

3738

3839
@import_helper(text="Please install BigQuery and configure your google-cloud access.")
@@ -91,27 +92,21 @@ def type_repr(self, t) -> str:
9192
except KeyError:
9293
return super().type_repr(t)
9394

94-
def parse_type(
95-
self,
96-
table_path: DbPath,
97-
col_name: str,
98-
type_repr: str,
99-
*args: Any, # pass-through args
100-
**kwargs: Any, # pass-through args
101-
) -> ColType:
102-
col_type = super().parse_type(table_path, col_name, type_repr, *args, **kwargs)
95+
def parse_type(self, table_path: DbPath, info: RawColumnInfo) -> ColType:
96+
col_type = super().parse_type(table_path, info)
10397
if isinstance(col_type, UnknownColType):
104-
m = self.TYPE_ARRAY_RE.fullmatch(type_repr)
98+
m = self.TYPE_ARRAY_RE.fullmatch(info.type_repr)
10599
if m:
106-
item_type = self.parse_type(table_path, col_name, m.group(1), *args, **kwargs)
100+
item_info = attrs.evolve(info, data_type=m.group(1))
101+
item_type = self.parse_type(table_path, item_info)
107102
col_type = Array(item_type=item_type)
108103

109104
# We currently ignore structs' structure, but later can parse it too. Examples:
110105
# - STRUCT<INT64, STRING(10)> (unnamed)
111106
# - STRUCT<foo INT64, bar STRING(10)> (named)
112107
# - STRUCT<foo INT64, bar ARRAY<INT64>> (with complex fields)
113108
# - STRUCT<foo INT64, bar STRUCT<a INT64, b INT64>> (nested)
114-
m = self.TYPE_STRUCT_RE.fullmatch(type_repr)
109+
m = self.TYPE_STRUCT_RE.fullmatch(info.type_repr)
115110
if m:
116111
col_type = Struct()
117112

data_diff/databases/clickhouse.py

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
)
1515
from data_diff.abcs.database_types import (
1616
ColType,
17+
DbPath,
1718
Decimal,
1819
Float,
1920
Integer,
@@ -24,6 +25,7 @@
2425
Timestamp,
2526
Boolean,
2627
)
28+
from data_diff.schema import RawColumnInfo
2729

2830
# https://clickhouse.com/docs/en/operations/server-configuration-parameters/settings/#default-database
2931
DEFAULT_DATABASE = "default"
@@ -75,19 +77,19 @@ def _convert_db_precision_to_digits(self, p: int) -> int:
7577
# because it does not help for float with a big integer part.
7678
return super()._convert_db_precision_to_digits(p) - 2
7779

78-
def _parse_type_repr(self, type_repr: str) -> Optional[Type[ColType]]:
80+
def parse_type(self, table_path: DbPath, info: RawColumnInfo) -> ColType:
7981
nullable_prefix = "Nullable("
80-
if type_repr.startswith(nullable_prefix):
81-
type_repr = type_repr[len(nullable_prefix) :].rstrip(")")
82+
if info.type_repr.startswith(nullable_prefix):
83+
info = attrs.evolve(info, data_type=info.type_repr[len(nullable_prefix) :].rstrip(")"))
8284

83-
if type_repr.startswith("Decimal"):
84-
type_repr = "Decimal"
85-
elif type_repr.startswith("FixedString"):
86-
type_repr = "FixedString"
87-
elif type_repr.startswith("DateTime64"):
88-
type_repr = "DateTime64"
85+
if info.type_repr.startswith("Decimal"):
86+
info = attrs.evolve(info, data_type="Decimal")
87+
elif info.type_repr.startswith("FixedString"):
88+
info = attrs.evolve(info, data_type="FixedString")
89+
elif info.type_repr.startswith("DateTime64"):
90+
info = attrs.evolve(info, data_type="DateTime64")
8991

90-
return self.TYPE_CLASSES.get(type_repr)
92+
return super().parse_type(table_path, info)
9193

9294
# def timestamp_value(self, t: DbTime) -> str:
9395
# # return f"'{t}'"

data_diff/databases/databricks.py

Lines changed: 25 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
import_helper,
2727
parse_table_name,
2828
)
29+
from data_diff.schema import RawColumnInfo
2930

3031

3132
@import_helper(text="You can install it using 'pip install databricks-sql-connector'")
@@ -138,7 +139,7 @@ def create_connection(self):
138139
except databricks.sql.exc.Error as e:
139140
raise ConnectionError(*e.args) from e
140141

141-
def query_table_schema(self, path: DbPath) -> Dict[str, tuple]:
142+
def query_table_schema(self, path: DbPath) -> Dict[str, RawColumnInfo]:
142143
# Databricks has INFORMATION_SCHEMA only for Databricks Runtime, not for Databricks SQL.
143144
# https://docs.databricks.com/spark/latest/spark-sql/language-manual/information-schema/columns.html
144145
# So, to obtain information about schema, we should use another approach.
@@ -155,7 +156,12 @@ def query_table_schema(self, path: DbPath) -> Dict[str, tuple]:
155156
if not rows:
156157
raise RuntimeError(f"{self.name}: Table '{'.'.join(path)}' does not exist, or has no columns")
157158

158-
d = {r.COLUMN_NAME: (r.COLUMN_NAME, r.TYPE_NAME, r.DECIMAL_DIGITS, None, None) for r in rows}
159+
d = {
160+
r.COLUMN_NAME: RawColumnInfo(
161+
column_name=r.COLUMN_NAME, type_repr=r.TYPE_NAME, datetime_precision=r.DECIMAL_DIGITS
162+
)
163+
for r in rows
164+
}
159165
assert len(d) == len(rows)
160166
return d
161167

@@ -173,37 +179,39 @@ def query_table_schema(self, path: DbPath) -> Dict[str, tuple]:
173179
# )
174180

175181
def _process_table_schema(
176-
self, path: DbPath, raw_schema: Dict[str, tuple], filter_columns: Sequence[str], where: str = None
182+
self, path: DbPath, raw_schema: Dict[str, RawColumnInfo], filter_columns: Sequence[str], where: str = None
177183
):
178184
accept = {i.lower() for i in filter_columns}
179-
rows = [row for name, row in raw_schema.items() if name.lower() in accept]
185+
col_infos = [row for name, row in raw_schema.items() if name.lower() in accept]
180186

181187
resulted_rows = []
182-
for row in rows:
183-
row_type = "DECIMAL" if row[1].startswith("DECIMAL") else row[1]
188+
for info in col_infos:
189+
row_type = "DECIMAL" if info.type_repr.startswith("DECIMAL") else info.type_repr
190+
info = attrs.evolve(info, type_repr=row_type)
184191
type_cls = self.dialect.TYPE_CLASSES.get(row_type, UnknownColType)
185192

186193
if issubclass(type_cls, Integer):
187-
row = (row[0], row_type, None, None, 0)
194+
info = attrs.evolve(info, numeric_scale=0)
188195

189196
elif issubclass(type_cls, Float):
190-
numeric_precision = math.ceil(row[2] / math.log(2, 10))
191-
row = (row[0], row_type, None, numeric_precision, None)
197+
numeric_precision = math.ceil(info.datetime_precision / math.log(2, 10))
198+
info = attrs.evolve(info, numeric_precision=numeric_precision)
192199

193200
elif issubclass(type_cls, Decimal):
194-
items = row[1][8:].rstrip(")").split(",")
201+
items = info.type_repr[8:].rstrip(")").split(",")
195202
numeric_precision, numeric_scale = int(items[0]), int(items[1])
196-
row = (row[0], row_type, None, numeric_precision, numeric_scale)
203+
info = attrs.evolve(
204+
info,
205+
numeric_precision=numeric_precision,
206+
numeric_scale=numeric_scale,
207+
)
197208

198209
elif issubclass(type_cls, Timestamp):
199-
row = (row[0], row_type, row[2], None, None)
210+
info = attrs.evolve(info, datetime_precision=info.datetime_precision)
200211

201-
else:
202-
row = (row[0], row_type, None, None, None)
212+
resulted_rows.append(info)
203213

204-
resulted_rows.append(row)
205-
206-
col_dict: Dict[str, ColType] = {row[0]: self.dialect.parse_type(path, *row) for row in resulted_rows}
214+
col_dict: Dict[str, ColType] = {info.column_name: self.dialect.parse_type(path, info) for info in resulted_rows}
207215

208216
self._refine_coltypes(path, col_dict, where)
209217
return col_dict

data_diff/databases/duckdb.py

Lines changed: 4 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import attrs
44
from packaging.version import parse as parse_version
55

6+
from data_diff.schema import RawColumnInfo
67
from data_diff.utils import match_regexps
78
from data_diff.abcs.database_types import (
89
Timestamp,
@@ -74,24 +75,16 @@ def _convert_db_precision_to_digits(self, p: int) -> int:
7475
# Subtracting 2 due to weird precision issues in PostgreSQL
7576
return super()._convert_db_precision_to_digits(p) - 2
7677

77-
def parse_type(
78-
self,
79-
table_path: DbPath,
80-
col_name: str,
81-
type_repr: str,
82-
datetime_precision: int = None,
83-
numeric_precision: int = None,
84-
numeric_scale: int = None,
85-
) -> ColType:
78+
def parse_type(self, table_path: DbPath, info: RawColumnInfo) -> ColType:
8679
regexps = {
8780
r"DECIMAL\((\d+),(\d+)\)": Decimal,
8881
}
8982

90-
for m, t_cls in match_regexps(regexps, type_repr):
83+
for m, t_cls in match_regexps(regexps, info.type_repr):
9184
precision = int(m.group(2))
9285
return t_cls(precision=precision)
9386

94-
return super().parse_type(table_path, col_name, type_repr, datetime_precision, numeric_precision, numeric_scale)
87+
return super().parse_type(table_path, info)
9588

9689
def set_timezone_to_utc(self) -> str:
9790
return "SET GLOBAL TimeZone='UTC'"

0 commit comments

Comments
 (0)