From 5bc4d5d51b6394763737b0f1611a1fa21043887c Mon Sep 17 00:00:00 2001 From: shaoyijie Date: Mon, 18 May 2026 01:30:37 -0700 Subject: [PATCH 1/7] feat(python): expose register_table_function for Paimon UDTFs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add `SQLContext.register_table_function(name, default_database=None)` to the Python binding so Paimon table-valued functions can be registered from Python — the binding previously had no way to reach `register_udtf`. A single dispatch method keeps the API surface stable: it currently supports `vector_search` and `full_text_search`, and the same `match` will pick up `referenced_files_size` / `physical_files_size` once those land, without changing the Python signature. The function binds to the current catalog. So the binding can obtain that catalog without keeping a duplicate handle of its own, `SQLContext::current_catalog` is made public. The binding also enables the `fulltext` feature so `register_full_text_search` is available. 
Co-Authored-By: Claude Opus 4.7 --- bindings/python/Cargo.toml | 2 +- bindings/python/src/context.rs | 36 ++++++++++++++++++- .../datafusion/src/sql_context.rs | 7 +++- 3 files changed, 42 insertions(+), 3 deletions(-) diff --git a/bindings/python/Cargo.toml b/bindings/python/Cargo.toml index 6ed24065..0c3b487c 100644 --- a/bindings/python/Cargo.toml +++ b/bindings/python/Cargo.toml @@ -31,6 +31,6 @@ arrow = { workspace = true, features = ["pyarrow"] } datafusion = { workspace = true } datafusion-ffi = { workspace = true } paimon = { path = "../../crates/paimon", features = ["storage-all"] } -paimon-datafusion = { path = "../../crates/integrations/datafusion" } +paimon-datafusion = { path = "../../crates/integrations/datafusion", features = ["fulltext"] } pyo3 = { version = "0.28", features = ["abi3-py310"] } tokio = { workspace = true } diff --git a/bindings/python/src/context.rs b/bindings/python/src/context.rs index e1050d38..1b7e3250 100644 --- a/bindings/python/src/context.rs +++ b/bindings/python/src/context.rs @@ -23,7 +23,10 @@ use datafusion::catalog::CatalogProvider; use datafusion_ffi::catalog_provider::FFI_CatalogProvider; use datafusion_ffi::proto::logical_extension_codec::FFI_LogicalExtensionCodec; use paimon::{CatalogFactory, Options}; -use paimon_datafusion::{PaimonCatalogProvider, SQLContext}; +use paimon_datafusion::{ + register_full_text_search, register_vector_search, PaimonCatalogProvider, SQLContext, +}; +use pyo3::exceptions::PyValueError; use pyo3::prelude::*; use pyo3::types::PyCapsule; @@ -148,6 +151,37 @@ impl PySQLContext { .map_err(df_to_py_err) } + /// Registers a built-in Paimon table-valued function (UDTF) on the session + /// so it can be used in SQL, e.g. + /// `SELECT * FROM vector_search('items', 'embedding', '[1.0, 0.0]', 10)`. + /// + /// `name` selects the function; supported values are `vector_search` and + /// `full_text_search`. 
The function is bound to the current catalog, so a + /// catalog must already be registered (the first `register_catalog` call + /// also sets it current). `default_database` defaults to `"default"` and + /// resolves the table-name argument the function receives in SQL. + #[pyo3(signature = (name, default_database=None))] + fn register_table_function( + &self, + name: String, + default_database: Option, + ) -> PyResult<()> { + let catalog = self.inner.current_catalog().map_err(df_to_py_err)?; + let default_database = default_database.as_deref().unwrap_or("default"); + let ctx = self.inner.ctx(); + match name.as_str() { + "vector_search" => register_vector_search(ctx, catalog, default_database), + "full_text_search" => register_full_text_search(ctx, catalog, default_database), + other => { + return Err(PyValueError::new_err(format!( + "unknown table function '{other}'; \ + supported: 'vector_search', 'full_text_search'" + ))) + } + } + Ok(()) + } + fn sql(&self, py: Python<'_>, sql: String) -> PyResult>> { let rt = runtime(); let batches = rt.block_on(async { diff --git a/crates/integrations/datafusion/src/sql_context.rs b/crates/integrations/datafusion/src/sql_context.rs index b54f443d..141832b5 100644 --- a/crates/integrations/datafusion/src/sql_context.rs +++ b/crates/integrations/datafusion/src/sql_context.rs @@ -1220,7 +1220,12 @@ impl SQLContext { .clone() } - fn current_catalog(&self) -> DFResult> { + /// Returns the Paimon catalog currently set as default. + /// + /// Exposed so callers that need the registered [`Catalog`] (for example to + /// register a table-valued function against it) can retrieve it without + /// keeping a duplicate handle of their own. 
+ pub fn current_catalog(&self) -> DFResult> { let name = self.current_catalog_name(); self.catalogs.get(&name).cloned().ok_or_else(|| { DataFusionError::Plan( From c261b80a43d7127f741f0486c2988429e2b7590e Mon Sep 17 00:00:00 2001 From: shaoyijie Date: Mon, 18 May 2026 01:45:20 -0700 Subject: [PATCH 2/7] test(python): cover register_table_function Add tests for `SQLContext.register_table_function`: - vector_search / full_text_search register without error - the optional default_database keyword is accepted - an unknown function name raises a clear error - calling it before any catalog is registered raises Registration alone touches neither the Lumina nor Tantivy runtime, so these tests are deterministic and need no index fixtures. Co-Authored-By: Claude Opus 4.7 --- bindings/python/tests/test_datafusion.py | 49 ++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/bindings/python/tests/test_datafusion.py b/bindings/python/tests/test_datafusion.py index 5e4e5e99..5ddc9e29 100644 --- a/bindings/python/tests/test_datafusion.py +++ b/bindings/python/tests/test_datafusion.py @@ -177,3 +177,52 @@ def test_register_batch_invalid_catalog(): assert False, "Expected an error for unknown catalog" except Exception as e: assert "unknown_catalog" in str(e).lower() or "not a paimon" in str(e).lower() or "unknown" in str(e).lower() + + +def test_register_table_function_vector_search(): + with tempfile.TemporaryDirectory() as warehouse: + ctx = SQLContext() + ctx.register_catalog("paimon", {"warehouse": warehouse}) + + # Registering against the current catalog should not raise. 
+ ctx.register_table_function("vector_search") + + +def test_register_table_function_full_text_search(): + with tempfile.TemporaryDirectory() as warehouse: + ctx = SQLContext() + ctx.register_catalog("paimon", {"warehouse": warehouse}) + + ctx.register_table_function("full_text_search") + + +def test_register_table_function_with_default_database(): + with tempfile.TemporaryDirectory() as warehouse: + ctx = SQLContext() + ctx.register_catalog("paimon", {"warehouse": warehouse}) + + # The optional default_database keyword is accepted. + ctx.register_table_function("vector_search", default_database="default") + + +def test_register_table_function_unknown_name(): + with tempfile.TemporaryDirectory() as warehouse: + ctx = SQLContext() + ctx.register_catalog("paimon", {"warehouse": warehouse}) + + try: + ctx.register_table_function("does_not_exist") + assert False, "Expected an error for an unknown table function" + except Exception as e: + assert "unknown table function" in str(e).lower() + assert "does_not_exist" in str(e) + + +def test_register_table_function_without_catalog(): + # With no catalog registered there is no current catalog to bind to. + ctx = SQLContext() + try: + ctx.register_table_function("vector_search") + assert False, "Expected an error when no catalog is registered" + except Exception as e: + assert "catalog" in str(e).lower() From 9ee59678bb038df1ba4f9063e0e6ddcbe91d59b0 Mon Sep 17 00:00:00 2001 From: shaoyijie Date: Mon, 18 May 2026 19:31:34 -0700 Subject: [PATCH 3/7] refactor: auto-register table functions on catalog registration Per review: register the built-in table-valued functions in Rust by default when a catalog is registered, instead of exposing an explicit register_table_function method on the Python binding. SQLContext::register_catalog now registers vector_search, full_text_search, referenced_files_size and physical_files_size against the catalog being registered, so every SQLContext user gets them with no extra call. 
The Python register_table_function method and the SQLContext::current_catalog visibility change are reverted; the binding keeps the fulltext feature so full_text_search compiles in. Co-Authored-By: Claude Opus 4.7 --- bindings/python/src/context.rs | 36 +------------ bindings/python/tests/test_datafusion.py | 53 +++++-------------- .../datafusion/src/sql_context.rs | 26 ++++++--- 3 files changed, 33 insertions(+), 82 deletions(-) diff --git a/bindings/python/src/context.rs b/bindings/python/src/context.rs index 1b7e3250..e1050d38 100644 --- a/bindings/python/src/context.rs +++ b/bindings/python/src/context.rs @@ -23,10 +23,7 @@ use datafusion::catalog::CatalogProvider; use datafusion_ffi::catalog_provider::FFI_CatalogProvider; use datafusion_ffi::proto::logical_extension_codec::FFI_LogicalExtensionCodec; use paimon::{CatalogFactory, Options}; -use paimon_datafusion::{ - register_full_text_search, register_vector_search, PaimonCatalogProvider, SQLContext, -}; -use pyo3::exceptions::PyValueError; +use paimon_datafusion::{PaimonCatalogProvider, SQLContext}; use pyo3::prelude::*; use pyo3::types::PyCapsule; @@ -151,37 +148,6 @@ impl PySQLContext { .map_err(df_to_py_err) } - /// Registers a built-in Paimon table-valued function (UDTF) on the session - /// so it can be used in SQL, e.g. - /// `SELECT * FROM vector_search('items', 'embedding', '[1.0, 0.0]', 10)`. - /// - /// `name` selects the function; supported values are `vector_search` and - /// `full_text_search`. The function is bound to the current catalog, so a - /// catalog must already be registered (the first `register_catalog` call - /// also sets it current). `default_database` defaults to `"default"` and - /// resolves the table-name argument the function receives in SQL. 
- #[pyo3(signature = (name, default_database=None))] - fn register_table_function( - &self, - name: String, - default_database: Option, - ) -> PyResult<()> { - let catalog = self.inner.current_catalog().map_err(df_to_py_err)?; - let default_database = default_database.as_deref().unwrap_or("default"); - let ctx = self.inner.ctx(); - match name.as_str() { - "vector_search" => register_vector_search(ctx, catalog, default_database), - "full_text_search" => register_full_text_search(ctx, catalog, default_database), - other => { - return Err(PyValueError::new_err(format!( - "unknown table function '{other}'; \ - supported: 'vector_search', 'full_text_search'" - ))) - } - } - Ok(()) - } - fn sql(&self, py: Python<'_>, sql: String) -> PyResult>> { let rt = runtime(); let batches = rt.block_on(async { diff --git a/bindings/python/tests/test_datafusion.py b/bindings/python/tests/test_datafusion.py index 5ddc9e29..9658bb08 100644 --- a/bindings/python/tests/test_datafusion.py +++ b/bindings/python/tests/test_datafusion.py @@ -179,50 +179,21 @@ def test_register_batch_invalid_catalog(): assert "unknown_catalog" in str(e).lower() or "not a paimon" in str(e).lower() or "unknown" in str(e).lower() -def test_register_table_function_vector_search(): +def test_table_functions_registered_with_catalog(): + """register_catalog auto-registers the built-in table-valued functions, so + they are callable in SQL without any extra registration step.""" with tempfile.TemporaryDirectory() as warehouse: ctx = SQLContext() ctx.register_catalog("paimon", {"warehouse": warehouse}) + ctx.sql("CREATE SCHEMA paimon.test_db") + ctx.sql("CREATE TABLE paimon.test_db.t (id INT, name STRING)") + ctx.sql("INSERT INTO paimon.test_db.t VALUES (1, 'alice')") - # Registering against the current catalog should not raise. 
- ctx.register_table_function("vector_search") - - -def test_register_table_function_full_text_search(): - with tempfile.TemporaryDirectory() as warehouse: - ctx = SQLContext() - ctx.register_catalog("paimon", {"warehouse": warehouse}) - - ctx.register_table_function("full_text_search") - - -def test_register_table_function_with_default_database(): - with tempfile.TemporaryDirectory() as warehouse: - ctx = SQLContext() - ctx.register_catalog("paimon", {"warehouse": warehouse}) - - # The optional default_database keyword is accepted. - ctx.register_table_function("vector_search", default_database="default") - + referenced = ctx.sql("SELECT * FROM referenced_files_size('test_db.t')") + assert pa.Table.from_batches(referenced).num_rows > 0 -def test_register_table_function_unknown_name(): - with tempfile.TemporaryDirectory() as warehouse: - ctx = SQLContext() - ctx.register_catalog("paimon", {"warehouse": warehouse}) + physical = ctx.sql("SELECT * FROM physical_files_size('test_db.t')") + assert pa.Table.from_batches(physical).num_rows > 0 - try: - ctx.register_table_function("does_not_exist") - assert False, "Expected an error for an unknown table function" - except Exception as e: - assert "unknown table function" in str(e).lower() - assert "does_not_exist" in str(e) - - -def test_register_table_function_without_catalog(): - # With no catalog registered there is no current catalog to bind to. 
- ctx = SQLContext() - try: - ctx.register_table_function("vector_search") - assert False, "Expected an error when no catalog is registered" - except Exception as e: - assert "catalog" in str(e).lower() + ctx.sql("DROP TABLE paimon.test_db.t") + ctx.sql("DROP SCHEMA paimon.test_db") diff --git a/crates/integrations/datafusion/src/sql_context.rs b/crates/integrations/datafusion/src/sql_context.rs index fd8fbc6c..062fb839 100644 --- a/crates/integrations/datafusion/src/sql_context.rs +++ b/crates/integrations/datafusion/src/sql_context.rs @@ -136,6 +136,25 @@ impl SQLContext { self.dynamic_options.clone(), )), ); + // Register the built-in table-valued functions against this catalog so + // they are usable in SQL without any extra registration call. + crate::vector_search::register_vector_search(&self.ctx, catalog.clone(), default_db); + #[cfg(feature = "fulltext")] + crate::full_text_search::register_full_text_search( + &self.ctx, + catalog.clone(), + default_db, + ); + crate::referenced_files_size::register_referenced_files_size( + &self.ctx, + catalog.clone(), + default_db, + ); + crate::physical_files_size::register_physical_files_size( + &self.ctx, + catalog.clone(), + default_db, + ); self.catalogs.insert(catalog_name.clone(), catalog); if is_first { self.set_current_catalog(catalog_name).await?; @@ -1220,12 +1239,7 @@ impl SQLContext { .clone() } - /// Returns the Paimon catalog currently set as default. - /// - /// Exposed so callers that need the registered [`Catalog`] (for example to - /// register a table-valued function against it) can retrieve it without - /// keeping a duplicate handle of their own. 
- pub fn current_catalog(&self) -> DFResult> { + fn current_catalog(&self) -> DFResult> { let name = self.current_catalog_name(); self.catalogs.get(&name).cloned().ok_or_else(|| { DataFusionError::Plan( From 7c6b5579130fb9866963460ea7ed86ca99fc00ff Mon Sep 17 00:00:00 2001 From: shaoyijie Date: Mon, 18 May 2026 20:13:25 -0700 Subject: [PATCH 4/7] style: satisfy rustfmt for full_text_search registration call The register_full_text_search call fits within the line width on a single line; rustfmt rejected the wrapped form. Co-Authored-By: Claude Opus 4.7 --- crates/integrations/datafusion/src/sql_context.rs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/crates/integrations/datafusion/src/sql_context.rs b/crates/integrations/datafusion/src/sql_context.rs index 062fb839..a9988e78 100644 --- a/crates/integrations/datafusion/src/sql_context.rs +++ b/crates/integrations/datafusion/src/sql_context.rs @@ -140,11 +140,7 @@ impl SQLContext { // they are usable in SQL without any extra registration call. crate::vector_search::register_vector_search(&self.ctx, catalog.clone(), default_db); #[cfg(feature = "fulltext")] - crate::full_text_search::register_full_text_search( - &self.ctx, - catalog.clone(), - default_db, - ); + crate::full_text_search::register_full_text_search(&self.ctx, catalog.clone(), default_db); crate::referenced_files_size::register_referenced_files_size( &self.ctx, catalog.clone(), From 1a292db3c79477a81a853c8a1c3d5d777e4fe803 Mon Sep 17 00:00:00 2001 From: shaoyijie Date: Mon, 18 May 2026 20:30:32 -0700 Subject: [PATCH 5/7] refactor: drop files_size from auto-registration (now system tables) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Upstream #325 converted referenced_files_size / physical_files_size from table functions to system tables, so they no longer have register_* functions. register_catalog now auto-registers only the remaining UDTFs — vector_search and full_text_search. 
The binding test is reworked accordingly: it verifies the two UDTFs are registered by triggering their own argument-count validation. Co-Authored-By: Claude Opus 4.7 --- bindings/python/tests/test_datafusion.py | 23 ++++++++----------- .../datafusion/src/sql_context.rs | 10 -------- 2 files changed, 10 insertions(+), 23 deletions(-) diff --git a/bindings/python/tests/test_datafusion.py b/bindings/python/tests/test_datafusion.py index 9658bb08..2576b7c2 100644 --- a/bindings/python/tests/test_datafusion.py +++ b/bindings/python/tests/test_datafusion.py @@ -180,20 +180,17 @@ def test_register_batch_invalid_catalog(): def test_table_functions_registered_with_catalog(): - """register_catalog auto-registers the built-in table-valued functions, so - they are callable in SQL without any extra registration step.""" + """register_catalog auto-registers vector_search / full_text_search as + UDTFs. Calling one with the wrong argument count surfaces the function's + own validation error, which proves it is registered — an unregistered + name would instead fail with 'table function not found'.""" with tempfile.TemporaryDirectory() as warehouse: ctx = SQLContext() ctx.register_catalog("paimon", {"warehouse": warehouse}) - ctx.sql("CREATE SCHEMA paimon.test_db") - ctx.sql("CREATE TABLE paimon.test_db.t (id INT, name STRING)") - ctx.sql("INSERT INTO paimon.test_db.t VALUES (1, 'alice')") - - referenced = ctx.sql("SELECT * FROM referenced_files_size('test_db.t')") - assert pa.Table.from_batches(referenced).num_rows > 0 - physical = ctx.sql("SELECT * FROM physical_files_size('test_db.t')") - assert pa.Table.from_batches(physical).num_rows > 0 - - ctx.sql("DROP TABLE paimon.test_db.t") - ctx.sql("DROP SCHEMA paimon.test_db") + for fn in ("vector_search", "full_text_search"): + try: + ctx.sql(f"SELECT * FROM {fn}('only_one_arg')") + assert False, f"expected {fn} to reject a single argument" + except Exception as e: + assert "requires 4 arguments" in str(e), str(e) diff --git 
a/crates/integrations/datafusion/src/sql_context.rs b/crates/integrations/datafusion/src/sql_context.rs index a9988e78..34c2f66c 100644 --- a/crates/integrations/datafusion/src/sql_context.rs +++ b/crates/integrations/datafusion/src/sql_context.rs @@ -141,16 +141,6 @@ impl SQLContext { crate::vector_search::register_vector_search(&self.ctx, catalog.clone(), default_db); #[cfg(feature = "fulltext")] crate::full_text_search::register_full_text_search(&self.ctx, catalog.clone(), default_db); - crate::referenced_files_size::register_referenced_files_size( - &self.ctx, - catalog.clone(), - default_db, - ); - crate::physical_files_size::register_physical_files_size( - &self.ctx, - catalog.clone(), - default_db, - ); self.catalogs.insert(catalog_name.clone(), catalog); if is_first { self.set_current_catalog(catalog_name).await?; From 416a41948f2314e0d18cc8ca8456cbd34dbdfe19 Mon Sep 17 00:00:00 2001 From: shaoyijie Date: Mon, 18 May 2026 23:31:27 -0700 Subject: [PATCH 6/7] refactor: extract register_table_functions helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per review: pull the inline built-in table-function registration in register_catalog into a dedicated function. It is the single place that knows the built-in table functions — new ones are added there. Co-Authored-By: Claude Opus 4.7 --- .../datafusion/src/sql_context.rs | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/crates/integrations/datafusion/src/sql_context.rs b/crates/integrations/datafusion/src/sql_context.rs index 34c2f66c..f77b79e2 100644 --- a/crates/integrations/datafusion/src/sql_context.rs +++ b/crates/integrations/datafusion/src/sql_context.rs @@ -136,11 +136,7 @@ impl SQLContext { self.dynamic_options.clone(), )), ); - // Register the built-in table-valued functions against this catalog so - // they are usable in SQL without any extra registration call. 
- crate::vector_search::register_vector_search(&self.ctx, catalog.clone(), default_db); - #[cfg(feature = "fulltext")] - crate::full_text_search::register_full_text_search(&self.ctx, catalog.clone(), default_db); + register_table_functions(&self.ctx, &catalog, default_db); self.catalogs.insert(catalog_name.clone(), catalog); if is_first { self.set_current_catalog(catalog_name).await?; @@ -2307,6 +2303,19 @@ fn ok_result(ctx: &SessionContext) -> DFResult { Ok(df) } +/// Registers the built-in table-valued functions against `catalog` so they can +/// be used in SQL without any extra setup call. Called for every catalog +/// registered on the context; add new built-in table functions here. +fn register_table_functions( + ctx: &SessionContext, + catalog: &Arc, + default_database: &str, +) { + crate::vector_search::register_vector_search(ctx, Arc::clone(catalog), default_database); + #[cfg(feature = "fulltext")] + crate::full_text_search::register_full_text_search(ctx, Arc::clone(catalog), default_database); +} + #[cfg(test)] mod tests { use super::*; From 4388e74c4f688efe8ddac7b0715c0725e5eb71b7 Mon Sep 17 00:00:00 2001 From: shaoyijie Date: Mon, 18 May 2026 23:34:16 -0700 Subject: [PATCH 7/7] docs: note that SQLContext auto-registers the table functions The Vector Search / Full-Text Search registration sections still told readers to call register_* manually. With a SQLContext that is now automatic on register_catalog; the explicit call is only needed with a raw SessionContext. Co-Authored-By: Claude Opus 4.7 --- docs/src/sql.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/docs/src/sql.md b/docs/src/sql.md index 94eef718..325d236e 100644 --- a/docs/src/sql.md +++ b/docs/src/sql.md @@ -53,7 +53,7 @@ async fn example() -> Result<(), Box> { } ``` -`SQLContext::new` creates a session context with the Paimon relation planner pre-registered. Use `register_catalog` to add one or more Paimon catalogs. 
It also manages session-scoped dynamic options internally for `SET`/`RESET` support. +`SQLContext::new` creates a session context with the Paimon relation planner pre-registered. Use `register_catalog` to add one or more Paimon catalogs; registering a catalog also registers the built-in table-valued functions (`vector_search`, `full_text_search`) against it. `SQLContext` also manages session-scoped dynamic options internally for `SET`/`RESET` support. ## Data Types @@ -445,6 +445,10 @@ Paimon supports approximate nearest neighbor (ANN) vector search via the Lumina ### Registration +When you use a `SQLContext`, `vector_search` is registered automatically for every catalog you register — no extra setup is needed. + +With a raw DataFusion `SessionContext`, register it explicitly: + ```rust use paimon_datafusion::register_vector_search; @@ -510,6 +514,10 @@ paimon-datafusion = { version = "0.1.0", features = ["fulltext"] } ### Registration +When you use a `SQLContext`, `full_text_search` is registered automatically for every catalog you register (when the `fulltext` feature is enabled) — no extra setup is needed. + +With a raw DataFusion `SessionContext`, register it explicitly: + ```rust use paimon_datafusion::register_full_text_search;