diff --git a/crates/bashkit-python/src/lib.rs b/crates/bashkit-python/src/lib.rs index 75e70564..c0649735 100644 --- a/crates/bashkit-python/src/lib.rs +++ b/crates/bashkit-python/src/lib.rs @@ -2194,7 +2194,7 @@ struct PyPrivateAsyncLoop { // all run_until_complete calls happen on the thread that created the loop, // satisfying asyncio's thread-affinity requirement (review comment on // spawn_blocking scheduling successive callbacks on different OS threads). - worker_tx: StdMutex>>, + worker_tx: StdMutex>>, // Cached Python helper for background-thread fallback (Jupyter/IPython compatibility). bg_thread_runner: StdMutex>>, } @@ -2212,12 +2212,17 @@ impl PyPrivateAsyncLoop { // keeps it there for the lifetime of the PyPrivateAsyncLoop. This pins all // run_until_complete calls to a single thread, matching asyncio's thread- // affinity contract. - fn ensure_worker_tx(&self) -> PyResult> { + fn ensure_worker_tx(&self) -> PyResult> { let mut guard = self.worker_tx.lock().expect("private loop worker tx lock"); if let Some(ref tx) = *guard { return Ok(tx.clone()); } - let (tx, rx) = std::sync::mpsc::sync_channel::(0); + // Unbounded channel: each caller enqueues exactly one item and then + // blocks on result_rx.recv(), so the queue depth equals the number of + // concurrent run_awaitable calls — naturally bounded by the execution + // model. Using unbounded (vs sync_channel(0)) ensures send() never + // blocks while the caller holds the GIL (TM-PY-030 variant 1). + let (tx, rx) = std::sync::mpsc::channel::(); std::thread::Builder::new() .name("bashkit-py-loop".into()) .spawn(move || { @@ -2329,8 +2334,9 @@ def _run(coro, ctx): // Dispatch to the dedicated worker thread that owns the asyncio event loop. // This ensures all run_until_complete calls happen on the same OS thread, - // preserving asyncio thread-affinity. The GIL is released while waiting so - // the worker thread can acquire it to run Python. + // preserving asyncio thread-affinity. The work channel is unbounded so + // sending cannot rendezvous with worker startup while the caller holds + // the GIL; the result wait below releases the GIL for Python execution. let tx = self.ensure_worker_tx()?; let (result_tx, result_rx) = std::sync::mpsc::sync_channel(0); let item = PrivateLoopWorkItem { @@ -2338,12 +2344,10 @@ def _run(coro, ctx): context: context.clone_ref(py), result_tx, }; - // THREAT[TM-PY-030]: detach (release the GIL) around BOTH the send and - // the recv. The channel is a rendezvous (capacity 0), so send blocks - // until the worker receives — and on first use the worker must acquire - // the GIL to create its event loop before it ever reaches recv(). - // Sending while attached therefore deadlocks the whole process: - // dispatcher holds the GIL waiting on send, worker waits on the GIL. + // THREAT[TM-PY-030]: detach (release the GIL) for the result rendezvous. + // The work channel is unbounded so send() returns immediately without + // blocking; only result_rx.recv() is a true blocking wait. GIL is still + // released across the entire block so the worker can attach at any point. // `move` ensures result_rx (Send, not Sync) is owned by the closure. py.detach(move || { tx.send(item) diff --git a/crates/bashkit-python/tests/test_async_callbacks.py b/crates/bashkit-python/tests/test_async_callbacks.py index 430d3067..883a1f47 100644 --- a/crates/bashkit-python/tests/test_async_callbacks.py +++ b/crates/bashkit-python/tests/test_async_callbacks.py @@ -11,6 +11,11 @@ import asyncio import contextvars import gc +import glob as _glob +import os +import subprocess +import sys +import textwrap import threading import time import weakref @@ -39,6 +44,62 @@ def _collect_between_tests(): # =========================================================================== +def test_async_callback_execute_sync_first_private_loop_call_does_not_deadlock(): + """First private-loop async callback returns instead of deadlocking the GIL.""" + + code = textwrap.dedent( + r""" + import asyncio + from bashkit import ScriptedTool + + async def greet(params, stdin=None): + await asyncio.sleep(0) + return "ok\n" + + tool = ScriptedTool("api") + tool.add_tool("greet", "Greet", callback=greet) + result = tool.execute_sync("greet") + print(result.exit_code) + print(result.stdout.strip()) + """ + ) + + # Build a PYTHONPATH that points directly at the directory that contains + # both bashkit/__init__.py AND the compiled bashkit._bashkit extension. + # We scan sys.path with glob rather than using find_spec: pytest imports + # bashkit from the source tree before this test runs, so find_spec looks + # for _bashkit inside that source package and returns None — leaving + # pkg_root unset and the subprocess still pointing at the source tree. + pkg_root = next( + ( + p + for p in sys.path + if p + and ( + _glob.glob(os.path.join(p, "bashkit", "_bashkit*.so")) + or _glob.glob(os.path.join(p, "bashkit", "_bashkit*.pyd")) + ) + ), + None, + ) + env = { + **os.environ, + "PYTHONPATH": (pkg_root + os.pathsep if pkg_root else "") + os.environ.get("PYTHONPATH", ""), + } + completed = subprocess.run( + [sys.executable, "-c", code], + capture_output=True, + text=True, + timeout=10.0, + check=False, + cwd="/", + env=env, + ) + + assert completed.returncode == 0, completed.stderr + assert completed.stdout.splitlines() == ["0", "ok"] + + def test_async_callback_execute_sync_honors_timeout(): """execute_sync() timeout preempts slow async callbacks on private loop.""" diff --git a/specs/threat-model.md b/specs/threat-model.md index 4ac136ea..fe6f531a 100644 --- a/specs/threat-model.md +++ b/specs/threat-model.md @@ -1745,7 +1745,7 @@ caller's GIL hold. | TM-PY-026 | reset() discards security config | `BashTool.reset()` creates new `Bash` with bare builder, dropping all configured limits | `PyBash::reset` and `BashTool::reset` rebuild via `replace_live_bash_with_builder` + `build_live_builder`, which preserves the original limits, env, and registered builtins | **MITIGATED** | | TM-PY-027 | Unbounded recursion in JSON conversion | `py_to_json`/`json_to_py` recurse without depth limit on nested dicts/lists | `json_to_py_inner`, `py_to_json_inner`, and the MontyObject converters all carry a `depth` arg; depth > `MAX_NESTING_DEPTH = 64` raises `ValueError("… nesting depth exceeds maximum of 64")` | **MITIGATED** | -| TM-PY-030 | GIL deadlock / exit crash via async-callback private loop | Private-loop dispatch blocked on a rendezvous channel while attached (GIL held); pyclass dealloc joined in-flight blocking tasks that must re-attach to finish (froze the whole process, observed as a 6 h CI hang); worker thread attached during interpreter finalization to close its loop (SIGABRT at process exit) | Dispatch detaches around both the send and the receive; `PyRuntime` drop shuts the tokio runtime down with `shutdown_background()` instead of a blocking join; worker exit path never touches Python (loop closed via `BaseEventLoop.__del__`) | **MITIGATED** | +| TM-PY-030 | GIL deadlock / exit crash via async-callback private loop | Private-loop dispatch blocked on a rendezvous channel while attached (GIL held); pyclass dealloc joined in-flight blocking tasks that must re-attach to finish (froze the whole process, observed as a 6 h CI hang); worker thread attached during interpreter finalization to close its loop (SIGABRT at process exit) | Work dispatch uses an unbounded `channel()` so send() never blocks; result rendezvous runs inside `py.detach(...)` so GIL is released for the blocking wait; `PyRuntime` drop shuts the tokio runtime down with `shutdown_background()` instead of a blocking join; worker exit path never touches Python (loop closed via `BaseEventLoop.__del__`) | **MITIGATED** | **TM-PY-026** (mitigated): `PyBash::reset` and `BashTool::reset` (`crates/bashkit-python/src/lib.rs`) rebuild the inner `Bash` via `replace_live_bash_with_builder` + `build_live_builder`, which @@ -1758,25 +1758,27 @@ MontyObject converters in `crates/bashkit-python/src/lib.rs` carry a `depth: usi At `depth > MAX_NESTING_DEPTH = 64`, conversion raises a Python `ValueError` instead of recursing. Coverage: `tests/_security_advanced.py::JsonConversionBoundariesTests`. -**TM-PY-030** (mitigated): two deadlock variants in the async-callback private-loop +**TM-PY-030** (mitigated): three deadlock/crash variants in the async-callback private-loop machinery (`crates/bashkit-python/src/lib.rs`). (1) `PyPrivateAsyncLoop::run_awaitable` -sent work to the dedicated worker thread over a `sync_channel(0)` rendezvous while -attached; on first use the worker must attach (acquire the GIL) to create its asyncio -loop before it can `recv()`, so dispatcher and worker waited on each other. The send -and receive now both run inside `py.detach(...)`. (2) Pyclass dealloc runs attached -and dropped the last `Arc`; tokio's default `Runtime::drop` joins in-flight -blocking tasks, and an abandoned (timed-out) callback task must re-attach to finish — -freezing the entire interpreter. The `PyRuntime` handle now shuts the runtime down -with `shutdown_background()` on last drop. (3) The private-loop worker thread called -`Python::attach` on its exit path to close its asyncio loop; the worker usually wakes -because the engine was gc'd, and that gc commonly runs inside `Py_Finalize` — -attaching during finalization fatals CPython (`PyGILState_Release`, SIGABRT at -interpreter exit; `Python::try_attach` cannot detect finalization before 3.13). The -worker exit path no longer touches Python: the loop's `Py` ref is dropped unattached -(deferred decref) and the loop is closed by `BaseEventLoop.__del__`. Regression tests: -`tests/test_async_callbacks.py::test_async_callback_execute_sync_honors_timeout`, -`…::test_dealloc_during_inflight_callback_does_not_deadlock`; variant (3) is covered -by the `langgraph_async_tool.py` example run in the Python CI Examples job. +originally sent work over a `sync_channel(0)` rendezvous while attached; on first use the +worker must attach (acquire the GIL) to create its asyncio loop before it can `recv()`, so +dispatcher and worker waited on each other. The work dispatch now uses an unbounded +`channel()` so `send()` never blocks regardless of worker state; the result rendezvous +(`result_rx.recv()`) still runs inside `py.detach(...)` to release the GIL for the blocking +wait. Regression test: `test_async_callback_execute_sync_first_private_loop_call_does_not_deadlock`. +(2) Pyclass dealloc runs attached and dropped the last `Arc`; tokio's default +`Runtime::drop` joins in-flight blocking tasks, and an abandoned (timed-out) callback task +must re-attach to finish — freezing the entire interpreter. The `PyRuntime` handle now shuts +the runtime down with `shutdown_background()` on last drop. Regression test: +`test_dealloc_during_inflight_callback_does_not_deadlock`. (3) The private-loop worker thread +called `Python::attach` on its exit path to close its asyncio loop; the worker usually wakes +because the engine was gc'd, and that gc commonly runs inside `Py_Finalize` — attaching during +finalization fatals CPython (`PyGILState_Release`, SIGABRT at interpreter exit; +`Python::try_attach` cannot detect finalization before 3.13). The worker exit path no longer +touches Python: the loop's `Py` ref is dropped unattached (deferred decref) and the loop is +closed by `BaseEventLoop.__del__`. Variant (3) is covered by the `langgraph_async_tool.py` +example run in the Python CI Examples job. Regression test also: +`test_async_callback_execute_sync_honors_timeout`. | TM-PY-029 | Host clock information disclosure | `datetime.date.today()` / `datetime.datetime.now()` expose host system time and timezone | Intentional — required for correct datetime semantics | **ACCEPTED** |