diff --git a/bindings/python/Cargo.toml b/bindings/python/Cargo.toml index 6ed24065..0c3b487c 100644 --- a/bindings/python/Cargo.toml +++ b/bindings/python/Cargo.toml @@ -31,6 +31,6 @@ arrow = { workspace = true, features = ["pyarrow"] } datafusion = { workspace = true } datafusion-ffi = { workspace = true } paimon = { path = "../../crates/paimon", features = ["storage-all"] } -paimon-datafusion = { path = "../../crates/integrations/datafusion" } +paimon-datafusion = { path = "../../crates/integrations/datafusion", features = ["fulltext"] } pyo3 = { version = "0.28", features = ["abi3-py310"] } tokio = { workspace = true } diff --git a/bindings/python/tests/test_datafusion.py b/bindings/python/tests/test_datafusion.py index 5e4e5e99..2576b7c2 100644 --- a/bindings/python/tests/test_datafusion.py +++ b/bindings/python/tests/test_datafusion.py @@ -177,3 +177,20 @@ def test_register_batch_invalid_catalog(): assert False, "Expected an error for unknown catalog" except Exception as e: assert "unknown_catalog" in str(e).lower() or "not a paimon" in str(e).lower() or "unknown" in str(e).lower() + + +def test_table_functions_registered_with_catalog(): + """register_catalog auto-registers vector_search / full_text_search as + UDTFs. Calling one with the wrong argument count surfaces the function's + own validation error, which proves it is registered — an unregistered + name would instead fail with 'table function not found'.""" + with tempfile.TemporaryDirectory() as warehouse: + ctx = SQLContext() + ctx.register_catalog("paimon", {"warehouse": warehouse}) + + for fn in ("vector_search", "full_text_search"): + try: + ctx.sql(f"SELECT * FROM {fn}('only_one_arg')") + assert False, f"expected {fn} to reject a single argument" + except Exception as e: + assert "requires 4 arguments" in str(e), str(e) diff --git a/crates/integrations/datafusion/src/sql_context.rs b/crates/integrations/datafusion/src/sql_context.rs index 973e263f..f77b79e2 100644 --- a/crates/integrations/datafusion/src/sql_context.rs +++ b/crates/integrations/datafusion/src/sql_context.rs @@ -136,6 +136,7 @@ impl SQLContext { self.dynamic_options.clone(), )), ); + register_table_functions(&self.ctx, &catalog, default_db); self.catalogs.insert(catalog_name.clone(), catalog); if is_first { self.set_current_catalog(catalog_name).await?; @@ -2302,6 +2303,19 @@ fn ok_result(ctx: &SessionContext) -> DFResult { Ok(df) } +/// Registers the built-in table-valued functions against `catalog` so they can +/// be used in SQL without any extra setup call. Called for every catalog +/// registered on the context; add new built-in table functions here. +fn register_table_functions( + ctx: &SessionContext, + catalog: &Arc, + default_database: &str, +) { + crate::vector_search::register_vector_search(ctx, Arc::clone(catalog), default_database); + #[cfg(feature = "fulltext")] + crate::full_text_search::register_full_text_search(ctx, Arc::clone(catalog), default_database); +} + #[cfg(test)] mod tests { use super::*; diff --git a/docs/src/sql.md b/docs/src/sql.md index 94eef718..325d236e 100644 --- a/docs/src/sql.md +++ b/docs/src/sql.md @@ -53,7 +53,7 @@ async fn example() -> Result<(), Box> { } ``` -`SQLContext::new` creates a session context with the Paimon relation planner pre-registered. Use `register_catalog` to add one or more Paimon catalogs. It also manages session-scoped dynamic options internally for `SET`/`RESET` support. +`SQLContext::new` creates a session context with the Paimon relation planner pre-registered. Use `register_catalog` to add one or more Paimon catalogs; registering a catalog also registers the built-in table-valued functions (`vector_search`, `full_text_search`) against it. It also manages session-scoped dynamic options internally for `SET`/`RESET` support. ## Data Types @@ -445,6 +445,10 @@ Paimon supports approximate nearest neighbor (ANN) vector search via the Lumina ### Registration +When you use a `SQLContext`, `vector_search` is registered automatically for every catalog you register — no extra setup is needed. + +With a raw DataFusion `SessionContext`, register it explicitly: + ```rust use paimon_datafusion::register_vector_search; @@ -510,6 +514,10 @@ paimon-datafusion = { version = "0.1.0", features = ["fulltext"] } ### Registration +When you use a `SQLContext`, `full_text_search` is registered automatically for every catalog you register (when the `fulltext` feature is enabled) — no extra setup is needed. + +With a raw DataFusion `SessionContext`, register it explicitly: + ```rust use paimon_datafusion::register_full_text_search;