diff --git a/.github/workflows/code-coverage.yml b/.github/workflows/code-coverage.yml index f4882f669..91ccb3c40 100644 --- a/.github/workflows/code-coverage.yml +++ b/.github/workflows/code-coverage.yml @@ -63,8 +63,21 @@ jobs: install-args: "--all-extras" - name: Run all tests with coverage continue-on-error: false + # This job installs --all-extras, so the REAL databricks-sql-kernel + # wheel is present. The unit suite fakes databricks_sql_kernel in + # sys.modules, which would shadow the real wheel in this shared + # session — so the kernel-backed suites that need the real wheel + # are excluded here and covered by the dedicated kernel-e2e.yml + # (isolated session, real wheel, live warehouse): + # - --ignore the kernel e2e files (they assert the real wheel and + # now FAIL LOUDLY rather than silently skip if shadowed), and + # - -m "not realkernel" deselects the no-network real-wheel + # routing test for the same reason. run: | poetry run pytest tests/unit tests/e2e \ + --ignore=tests/e2e/test_kernel_backend.py \ + --ignore=tests/e2e/test_kernel_tls.py \ + -m "not realkernel" \ -n 4 \ --dist=loadgroup \ --cov=src \ diff --git a/.github/workflows/code-quality-checks.yml b/.github/workflows/code-quality-checks.yml index 4071a6e51..bb4db5b83 100644 --- a/.github/workflows/code-quality-checks.yml +++ b/.github/workflows/code-quality-checks.yml @@ -48,7 +48,7 @@ jobs: echo "=== Dependency Version: ${{ matrix.dependency-version }} ===" poetry run pip list - name: Run tests - run: poetry run python -m pytest tests/unit + run: poetry run python -m pytest tests/unit -m "not realkernel" run-unit-tests-with-arrow: runs-on: @@ -77,7 +77,11 @@ jobs: uses: ./.github/actions/setup-poetry with: python-version: ${{ matrix.python-version }} - install-args: "--all-extras" + # Install ONLY the pyarrow extra (not --all-extras) so this + # tier isolates the "pyarrow present, kernel absent" + # configuration. 
--all-extras would also pull the kernel wheel, + # making this job redundant with "Unit Tests + Kernel". + install-args: "--extras pyarrow" cache-suffix: "pyarrow-${{ matrix.dependency-version }}-" - name: Install Python tools for custom versions if: matrix.dependency-version != 'default' @@ -96,7 +100,57 @@ jobs: echo "=== Dependency Version: ${{ matrix.dependency-version }} with PyArrow ===" poetry run pip list - name: Run tests - run: poetry run python -m pytest tests/unit + run: poetry run python -m pytest tests/unit -m "not realkernel" + + run-unit-tests-with-kernel: + runs-on: + group: databricks-protected-runner-group + labels: linux-ubuntu-latest + strategy: + matrix: + # Kernel wheel is cp310-abi3 (Requires-Python >=3.10), so this + # matrix omits 3.9 — the [kernel] extra is a no-op there. + python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"] + + name: "Unit Tests + Kernel (Python ${{ matrix.python-version }})" + + steps: + - name: Check out repository + uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 + - name: Install Kerberos system dependencies + run: | + sudo apt-get update + sudo apt-get install -y libkrb5-dev + - name: Setup Poetry + uses: ./.github/actions/setup-poetry + with: + python-version: ${{ matrix.python-version }} + # Install the kernel extra (pulls the published + # databricks-sql-kernel wheel, which transitively brings + # pyarrow). Explicit --extras kernel rather than --all-extras + # so this tier targets the kernel configuration specifically. 
+ install-args: "--extras kernel" + cache-suffix: "kernel-" + - name: Show installed versions + run: | + echo "=== with databricks-sql-kernel ===" + poetry run pip list + - name: Assert the real kernel wheel is installed (not a stub) + run: | + poetry run python -c "import databricks_sql_kernel as k; assert k.__file__, 'kernel wheel missing __file__ — not the real wheel'; print('real kernel wheel:', k.__file__)" + - name: Unit tests (kernel wheel present, realkernel deselected) + # The bulk of tests/unit fakes databricks_sql_kernel in + # sys.modules, so the real-wheel routing test is deselected here + # and run on its own below (a shared session would shadow the + # real wheel — both real-wheel tests fail loudly if that happens). + run: poetry run python -m pytest tests/unit -m "not realkernel" + - name: Drive use_kernel=True through the REAL wheel (routing) + # Separate invocation, explicit file path: never collects the + # fake-module test file, so sys.modules stays unpolluted. This is + # the no-network proof that sql.connect(use_kernel=True) actually + # instantiates the real KernelDatabricksClient (not a stub, not a + # Thrift fallback). Fails loudly if the real wheel is shadowed. + run: poetry run python -m pytest tests/unit/test_session.py -m realkernel -v check-linting: runs-on: diff --git a/README.md b/README.md index 047515ba4..dcd726b9b 100644 --- a/README.md +++ b/README.md @@ -30,6 +30,30 @@ Install using `pip install databricks-sql-connector` ### Installing the core library with PyArrow Install using `pip install databricks-sql-connector[pyarrow]` +### Installing with the Rust kernel backend (`use_kernel=True`) +Install using `pip install databricks-sql-connector[kernel]` + +This adds the optional [`databricks-sql-kernel`](https://pypi.org/project/databricks-sql-kernel/) +extension (a native Rust client core, exposed via PyO3). 
Pass +`use_kernel=True` to `sql.connect(...)` to route the connection through it +instead of the default Thrift backend: + +```python +connection = sql.connect( + server_hostname=host, + http_path=http_path, + access_token=token, + use_kernel=True, +) +``` + +Notes: +- Requires **Python >= 3.10** (the kernel wheel is published as + `cp310-abi3`). On older interpreters the `[kernel]` extra installs + nothing and `use_kernel=True` raises an `ImportError`. +- The extra also pulls in PyArrow, which the kernel result path requires. +- Authentication supports PAT (`access_token`), OAuth M2M, and OAuth U2M. + ```bash export DATABRICKS_HOST=********.databricks.com diff --git a/pyproject.toml b/pyproject.toml index 2fa42e02b..212a7d17d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,31 +25,56 @@ openpyxl = "^3.0.10" urllib3 = ">=1.26" python-dateutil = "^2.8.0" pyarrow = [ - { version = ">=14.0.1", python = ">=3.8,<3.13", optional=true }, + # The <3.10 band is capped at <23 because pyarrow>=23 dropped + # Python 3.9 (it requires >=3.10). Without the cap, poetry tries to + # unify this entry with the kernel's transitive pyarrow>=23.0.1,<24 + # across the 3.8–3.10 slice and `poetry lock` fails ("pyarrow is + # forbidden"). The cap removes no installable version — the newest + # pyarrow with a 3.9 wheel is 21.x — it just makes that explicit to + # the solver so the optional [kernel] extra (python>=3.10) can + # coexist. See the kernel dep + [kernel] extra below. + { version = ">=14.0.1,<23", python = ">=3.8,<3.10", optional=true }, + { version = ">=14.0.1", python = ">=3.10,<3.13", optional=true }, { version = ">=18.0.0", python = ">=3.13,<3.14", optional=true }, { version = ">=22.0.0", python = ">=3.14", optional=true } ] pyjwt = "^2.0.0" pybreaker = "^1.0.0" requests-kerberos = {version = "^0.15.0", optional = true} +# Optional Rust kernel backend for ``use_kernel=True`` (PyO3 wheel). +# Pulled in only via the ``[kernel]`` extra below. 
The published wheel +# is ``abi3`` with ``Requires-Python: >=3.10`` (built ``abi3-py310``), +# so the dependency is gated to Python >= 3.10: on 3.8/3.9 the +# ``[kernel]`` extra resolves to nothing and ``use_kernel=True`` raises +# a clear ImportError at runtime (see backend/kernel/_errors.py). +# +# Floor is 0.2.0 (``^0.2.0`` == ``>=0.2.0,<0.3.0``). The kernel is +# pre-1.0, so each 0.x minor may carry breaking changes — the ``<0.3.0`` +# cap means we bump this deliberately when the kernel ships 0.3.0 rather +# than letting a potentially-breaking minor flow in automatically. 0.2.0 +# keeps the same Requires-Python (>=3.10) and pyarrow (>=23.0.1,<24) pin +# as 0.1.x, so the gating below is unchanged. +databricks-sql-kernel = {version = "^0.2.0", optional = true, python = ">=3.10"} [tool.poetry.extras] pyarrow = ["pyarrow"] -# `[kernel]` extra is intentionally not declared here yet. -# `databricks-sql-kernel` is built from the databricks-sql-kernel -# repo and not yet published to PyPI; declaring it as a poetry dep -# breaks `poetry lock` for every CI job. Once the wheel is on PyPI -# the extra will be added back here: -# -# databricks-sql-kernel = {version = "^0.1.0", optional = true} -# [tool.poetry.extras] -# kernel = ["databricks-sql-kernel"] +# ``pip install databricks-sql-connector[kernel]`` adds the Rust kernel +# backend so ``use_kernel=True`` works. No-op on Python < 3.10 (the +# wheel's floor) — those users get a runtime ImportError if they pass +# ``use_kernel=True``. # -# Until then, the wheel is not on PyPI and the only supported -# install path is local dev: -# cd databricks-sql-kernel/pyo3 && maturin develop --release -# (into the same venv as databricks-sql-connector). 
+# The kernel result path (``backend/kernel/result_set.py``) needs +# pyarrow, but it is NOT listed in this extra on purpose: the published +# kernel wheel declares ``pyarrow>=23.0.1,<24`` as a hard runtime +# dependency, so ``pip install ...[kernel]`` already pulls a compatible +# pyarrow transitively. Listing bare ``pyarrow`` here additionally +# forces poetry to co-resolve an unconstrained pyarrow against the +# kernel's ``>=23.0.1,<24`` (which itself requires Python >=3.10) across +# the connector's full 3.8–3.14 support matrix, which is unsatisfiable +# on 3.8/3.9 and breaks ``poetry lock``. The kernel's own dependency +# metadata is the single source of truth for the pyarrow floor. +kernel = ["databricks-sql-kernel"] [tool.poetry.group.dev.dependencies] pytest = "^7.1.2" @@ -82,7 +107,8 @@ exclude = '/(\.eggs|\.git|\.hg|\.mypy_cache|\.nox|\.tox|\.venv|\.svn|_build|buck [tool.pytest.ini_options] markers = [ "reviewed: Test case has been reviewed by Databricks", - "serial: Tests that must run serially (not parallelized)" + "serial: Tests that must run serially (not parallelized)", + "realkernel: Requires the real databricks-sql-kernel wheel and an unpolluted sys.modules (no fake kernel stub); must run in a separate pytest invocation from tests that fake databricks_sql_kernel (deselect with -m 'not realkernel', run alone with -m realkernel).", ] minversion = "6.0" log_cli = "false" diff --git a/src/databricks/sql/backend/kernel/_errors.py b/src/databricks/sql/backend/kernel/_errors.py index 334866a37..bcf3f56b1 100644 --- a/src/databricks/sql/backend/kernel/_errors.py +++ b/src/databricks/sql/backend/kernel/_errors.py @@ -52,11 +52,13 @@ import databricks_sql_kernel as _kernel # type: ignore[import-not-found] except ImportError as exc: # pragma: no cover - same hint as client.py raise ImportError( - "use_kernel=True requires the databricks-sql-kernel extension, which " - "is not yet published on PyPI. 
Build and install it locally from the " - "databricks-sql-kernel repo:\n" - " cd databricks-sql-kernel/pyo3 && maturin develop --release\n" - "(into the same venv as databricks-sql-connector)." + "use_kernel=True requires the optional databricks-sql-kernel " + "extension, which is not installed. Install it with:\n" + ' pip install "databricks-sql-connector[kernel]"\n' + "The kernel wheel requires Python >= 3.10; on older interpreters " + "use_kernel is unavailable. For local kernel development you can " + "instead build it from the databricks-sql-kernel repo:\n" + " cd databricks-sql-kernel/pyo3 && maturin develop --release" ) from exc # Route the kernel's Rust-side logs into Python's ``logging`` as soon as diff --git a/tests/e2e/test_kernel_backend.py b/tests/e2e/test_kernel_backend.py index 4c822caa8..8b532a56a 100644 --- a/tests/e2e/test_kernel_backend.py +++ b/tests/e2e/test_kernel_backend.py @@ -21,6 +21,7 @@ from __future__ import annotations +import sys from uuid import uuid4 import pytest @@ -34,24 +35,59 @@ ServerOperationError, ) -# Skip the whole module unless the kernel wheel is genuinely installed. -# ``pytest.importorskip`` alone isn't enough: the kernel unit tests inject a -# fake ``databricks_sql_kernel`` ModuleType into ``sys.modules`` so the -# connector's import-time ``import databricks_sql_kernel`` succeeds without -# the Rust extension. In the same pytest session that fake module is still -# in ``sys.modules`` when this e2e file is collected, and importorskip -# happily returns it. A real wheel exposes ``__file__`` (the compiled -# extension on disk); the fake ModuleType does not. -_kernel_mod = pytest.importorskip( - "databricks_sql_kernel", - reason="use_kernel=True requires the databricks-sql-kernel package", -) -if not getattr(_kernel_mod, "__file__", None): +# These tests must run against the REAL databricks-sql-kernel wheel and +# must NOT silently pass when it's absent or shadowed. 
We distinguish +# three states explicitly so a misconfigured CI job can't look green: +# +# 1. Wheel genuinely not installed -> legitimate skip. +# 2. Wheel installed in the env but ``sys.modules`` currently holds a +# stub (the kernel UNIT tests inject a fake ``databricks_sql_kernel`` +# ModuleType; in a shared ``pytest tests/unit tests/e2e`` session +# that fake can still be resident when this file is collected) -> +# FAIL loudly. Silently skipping here is what made the coverage job +# look like it exercised the kernel when it didn't. +# 3. Wheel installed and importable for real -> run. +# +# "Installed in the env" is decided via importlib.metadata (the dist +# database on disk), which a ``sys.modules`` stub can't fake. The +# ``__file__`` check then tells a real compiled extension from a stub +# ModuleType. +import importlib.metadata as _ilm + +try: + _ilm.version("databricks-sql-kernel") + _kernel_installed = True +except _ilm.PackageNotFoundError: + _kernel_installed = False + +_kernel_mod = sys.modules.get("databricks_sql_kernel") +if _kernel_mod is None: + try: + import databricks_sql_kernel as _kernel_mod # type: ignore[import-not-found] + except ImportError: + _kernel_mod = None + +_kernel_is_real = _kernel_mod is not None and getattr(_kernel_mod, "__file__", None) + +if not _kernel_installed: + # State 1: nothing to test against. pytest.skip( - "databricks_sql_kernel is a test stub (no __file__); " - "install the real wheel to run kernel e2e tests", + "databricks-sql-kernel is not installed; " + "install the real wheel (pip install 'databricks-sql-connector[kernel]') " + "to run kernel e2e tests", allow_module_level=True, ) +elif not _kernel_is_real: + # State 2: the wheel IS installed but a stub is shadowing it. Do NOT + # skip — that would hide the fact that the kernel path never ran. 
+ raise RuntimeError( + "databricks-sql-kernel is installed in this environment but " + "sys.modules holds a stub (no __file__) — the kernel e2e tests " + "would not actually exercise the real wheel. This usually means a " + "unit test's fake databricks_sql_kernel module is shadowing the " + "real one in a shared pytest session. Run the kernel e2e tests in " + "isolation (separate pytest invocation) so the real wheel loads." + ) @pytest.fixture(scope="module") @@ -80,9 +116,21 @@ def kernel_conn_params(connection_details): @pytest.fixture def conn(kernel_conn_params): """One-shot connection per test (the simple_test pattern the - existing e2e suite uses for cursor-level tests).""" + existing e2e suite uses for cursor-level tests). + + Asserts the connection actually routed through the kernel backend — + if ``use_kernel=True`` silently fell back to Thrift (e.g. a wiring + regression), these tests must fail rather than pass against the + wrong backend. + """ + from databricks.sql.backend.kernel.client import KernelDatabricksClient + c = sql.connect(**kernel_conn_params) try: + assert isinstance(c.session.backend, KernelDatabricksClient), ( + "use_kernel=True did not route through KernelDatabricksClient; " + f"got {type(c.session.backend).__name__}" + ) yield c finally: c.close() diff --git a/tests/e2e/test_kernel_tls.py b/tests/e2e/test_kernel_tls.py index 71dc25a24..2d1c42584 100644 --- a/tests/e2e/test_kernel_tls.py +++ b/tests/e2e/test_kernel_tls.py @@ -30,25 +30,46 @@ from __future__ import annotations import os +import sys import pytest import databricks.sql as sql from databricks.sql.exc import Error as DatabricksSqlError -# Same real-wheel guard as test_kernel_backend.py: a fake -# ``databricks_sql_kernel`` ModuleType injected by the unit tests has no -# ``__file__``; only a compiled wheel does. 
-_kernel_mod = pytest.importorskip( - "databricks_sql_kernel", - reason="use_kernel=True requires the databricks-sql-kernel package", -) -if not getattr(_kernel_mod, "__file__", None): +# Same real-wheel guard as test_kernel_backend.py — see the detailed +# rationale there. Skip only when the wheel is genuinely not installed; +# FAIL LOUDLY if it's installed but shadowed by a stub (so a misconfigured +# shared pytest session can't silently pass as covering the kernel). +import importlib.metadata as _ilm + +try: + _ilm.version("databricks-sql-kernel") + _kernel_installed = True +except _ilm.PackageNotFoundError: + _kernel_installed = False + +_kernel_mod = sys.modules.get("databricks_sql_kernel") +if _kernel_mod is None: + try: + import databricks_sql_kernel as _kernel_mod # type: ignore[import-not-found] + except ImportError: + _kernel_mod = None +_kernel_is_real = _kernel_mod is not None and getattr(_kernel_mod, "__file__", None) + +if not _kernel_installed: pytest.skip( - "databricks_sql_kernel is a test stub (no __file__); " - "install the real wheel to run kernel TLS e2e tests", + "databricks-sql-kernel is not installed; install the real wheel " + "to run kernel TLS e2e tests", allow_module_level=True, ) +elif not _kernel_is_real: + raise RuntimeError( + "databricks-sql-kernel is installed but sys.modules holds a stub " + "(no __file__) — the kernel TLS e2e tests would not exercise the " + "real wheel. Run them in isolation (separate pytest invocation) so " + "a unit-test fake module doesn't shadow the real one." 
+ ) _MITM_CA = os.getenv("MITMPROXY_CA_CERT") if not _MITM_CA: diff --git a/tests/unit/test_kernel_client.py b/tests/unit/test_kernel_client.py index 44ed42781..79be53e64 100644 --- a/tests/unit/test_kernel_client.py +++ b/tests/unit/test_kernel_client.py @@ -61,6 +61,21 @@ def __init__( self.error_details_json = error_details_json +# These unit tests exercise the connector's error-mapping / wiring logic +# and need a *controllable* fake ``KernelError`` (to simulate arbitrary +# kernel error codes), so they install a fake ``databricks_sql_kernel`` +# into ``sys.modules`` unconditionally. +# +# IMPORTANT: this fake is session-global and shadows a real wheel if one +# is installed. Tests that need the REAL wheel (the use_kernel routing +# test in test_session.py and the e2e suites test_kernel_backend.py / +# test_kernel_tls.py under tests/e2e) MUST be run in a SEPARATE pytest +# invocation from this file — never `pytest tests/unit tests/e2e` in one +# session when the real wheel is installed. All of those real-wheel +# tests detect the shadowing (real wheel present but sys.modules holds a +# stub) and FAIL LOUDLY rather than silently skipping, so a CI job that +# accidentally mixes them will go red instead of falsely green. The +# kernel CI matrix runs the real-wheel routing test as its own step.
_fake_kernel_module = types.ModuleType("databricks_sql_kernel") _fake_kernel_module.KernelError = _FakeKernelError # type: ignore[attr-defined] _fake_kernel_module.Session = MagicMock() # type: ignore[attr-defined] diff --git a/tests/unit/test_session.py b/tests/unit/test_session.py index 27d2b96c7..ba008b103 100644 --- a/tests/unit/test_session.py +++ b/tests/unit/test_session.py @@ -525,3 +525,65 @@ def test_user_agent_entry_reaches_kernel_client_http_headers(self): assert "my-partner-app" in ua, f"UA was {ua!r}" finally: conn.close() + + +@pytest.mark.realkernel +class TestUseKernelRoutesThroughRealWheel: + """No-network proof that ``sql.connect(use_kernel=True)`` actually + routes through the REAL databricks-sql-kernel wheel — not a stub and + not a fallback to Thrift. + + This is the unit-side complement to the live e2e suite: it does not + need a warehouse (only the network boundary ``open_session`` is + mocked), but unlike the other kernel unit tests it does NOT fake the + wheel — the real ``KernelDatabricksClient`` is instantiated and its + ``_kernel_session`` is built from the real ``databricks_sql_kernel`` + ``Session``. Skips only when the real wheel is genuinely absent + (e.g. the no-kernel CI tier); it must never silently pass when the + wheel is present. + """ + + def _real_kernel_or_skip(self): + import importlib.metadata as ilm + + try: + ilm.version("databricks-sql-kernel") + except ilm.PackageNotFoundError: + pytest.skip("databricks-sql-kernel wheel not installed") + mod = __import__("databricks_sql_kernel") + if not getattr(mod, "__file__", None): + pytest.fail( + "databricks-sql-kernel is installed but sys.modules holds a " + "stub (no __file__) — a unit-test fake is shadowing the real " + "wheel; this routing test would not exercise the real kernel." 
+ ) + + def test_connect_use_kernel_instantiates_real_kernel_backend(self): + self._real_kernel_or_skip() + + from databricks.sql.backend.kernel.client import KernelDatabricksClient + + # Mock only the network boundary: the real KernelDatabricksClient + # is constructed (building a real databricks_sql_kernel Session), + # but open_session() doesn't hit the wire. + with patch.object( + KernelDatabricksClient, + "open_session", + return_value=SessionId(BackendType.SEA, "sess-id", None), + ): + conn = databricks.sql.connect( + server_hostname="foo.cloud.databricks.com", + http_path="/sql/1.0/warehouses/abc", + use_kernel=True, + access_token="dapi-xyz", + enable_telemetry=False, + ) + try: + # The active backend is the REAL kernel client class. + assert isinstance(conn.session.backend, KernelDatabricksClient), ( + "use_kernel=True did not route through the real " + f"KernelDatabricksClient; got " + f"{type(conn.session.backend).__name__}" + ) + finally: + conn.close()