diff --git a/env.example b/env.example index 191a8ab9..c738072b 100644 --- a/env.example +++ b/env.example @@ -9,6 +9,10 @@ # BUB_MODEL=openrouter:openrouter/free # BUB_MAX_STEPS=50 # BUB_MAX_TOKENS=16384 +# Max UTF-8 bytes of a single tool result kept inline; larger output is +# truncated and spilled to <BUB_HOME>/tool-output (0 disables). Guards against +# 413 Request Entity Too Large from oversized tool output. +# BUB_MAX_TOOL_RESULT_BYTES=131072 # BUB_MODEL_TIMEOUT_SECONDS=300 # BUB_HOME=~/.bub diff --git a/src/bub/builtin/model_runner.py b/src/bub/builtin/model_runner.py index 2664b964..6a552b14 100644 --- a/src/bub/builtin/model_runner.py +++ b/src/bub/builtin/model_runner.py @@ -26,11 +26,12 @@ from bub.builtin.settings import AgentSettings, ModelCandidate from bub.builtin.tape import Tape +from bub.builtin.tool_output import cap_tool_result from bub.runtime import AsyncStreamEvents, BubError, ErrorKind, StreamEvent, StreamState from bub.tools import Tool, ToolContext, ToolExecutor CONTEXT_LENGTH_PATTERNS = re.compile( - r"context.{0,20}(?:length|window)|maximum.{0,20}context|token.{0,10}limit|prompt.{0,10}too long|tokens? > \d+ maximum", + r"context.{0,20}(?:length|window)|maximum.{0,20}context|token.{0,10}limit|prompt.{0,10}too long|tokens? 
def _cap_tool_results(self, results: list[Any], *, run_id: str) -> list[Any]:
    """Return ``results`` with every entry passed through ``cap_tool_result``.

    The byte budget comes from ``self.settings.max_tool_result_bytes`` and
    overflow is spilled into the ``tool-output`` directory under ``bub.home``.
    Each result's position in ``results`` is forwarded as its ``index``, and
    ``run_id`` is forwarded unchanged, so the helper can name the spill file.

    The run iterator calls this right after tool execution, so the capped
    values are what get recorded on the tape and emitted in the
    ``tool_result`` / ``final`` stream events.
    """
    # NOTE(review): function-scope import kept from the original -- presumably
    # it avoids an import cycle with the top-level package; confirm.
    import bub

    byte_budget = self.settings.max_tool_result_bytes
    overflow_dir = bub.home / "tool-output"

    capped: list[Any] = []
    for position, item in enumerate(results):
        capped.append(
            cap_tool_result(
                item,
                run_id=run_id,
                index=position,
                limit=byte_budget,
                spill_dir=overflow_dir,
            )
        )
    return capped
import shlex
from pathlib import Path

# Default inline budget for a single tool result: 128 KiB of UTF-8.
DEFAULT_MAX_TOOL_RESULT_BYTES = 128 * 1024


def _human_bytes(num_bytes: int) -> str:
    """Format ``num_bytes`` as a short 1024-based size such as ``7.8 KB``.

    Plain bytes are shown as an integer; larger units carry one decimal. The
    unit is chosen on the *rounded* value, so 1048575 bytes reads ``1.0 MB``
    instead of the misleading ``1024.0 KB``.
    """
    size = float(num_bytes)
    if size < 1024:
        return f"{int(size)} B"
    for unit in ("KB", "MB", "GB", "TB"):
        size /= 1024
        # Compare what will actually be printed, not the unrounded value.
        if round(size, 1) < 1024:
            return f"{size:.1f} {unit}"
    return f"{size / 1024:.1f} PB"


def _spill_to_file(data: bytes, *, run_id: str, index: int, spill_dir: Path) -> Path:
    """Write ``data`` to ``<spill_dir>/<run_id>-call-<index>.txt`` and return the path.

    ``spill_dir`` (and any missing parents) is created on demand. An existing
    file with the same name is overwritten.

    Raises:
        OSError: if the directory cannot be created or the file cannot be written.
    """
    spill_dir.mkdir(parents=True, exist_ok=True)
    path = spill_dir / f"{run_id}-call-{index}.txt"
    path.write_bytes(data)
    return path


def _truncation_footer(*, original_bytes: int, limit: int, spill_path: Path) -> str:
    """Build the notice appended to a truncated result whose full text was saved.

    The ``[full output saved to: ...]`` line carries the raw path; the shell
    hints use a ``shlex``-quoted path so they can be pasted as-is.
    """
    quoted = shlex.quote(str(spill_path))
    return (
        f"\n\n[output truncated: original {_human_bytes(original_bytes)} "
        f"exceeded {_human_bytes(limit)} limit]\n"
        f"[full output saved to: {spill_path}]\n"
        f"[hint: inspect the end with `tail -c 4096 {quoted}` "
        # rg takes the pattern first; a bare path would be used as the pattern.
        f"or search it with `rg PATTERN {quoted}`]"
    )


def _unsaved_truncation_footer(*, original_bytes: int, limit: int, error: OSError) -> str:
    """Build the notice for a truncated result whose full text could not be saved."""
    return (
        f"\n\n[output truncated: original {_human_bytes(original_bytes)} "
        f"exceeded {_human_bytes(limit)} limit]\n"
        f"[full output could not be saved: {error}]"
    )


def cap_tool_result(
    result: object,
    *,
    run_id: str,
    index: int,
    limit: int,
    spill_dir: Path,
) -> object:
    """Bound a single tool result to roughly ``limit`` UTF-8 bytes.

    Non-string results, strings already within budget, and non-positive limits
    are returned unchanged. An oversized string is written in full to
    ``spill_dir`` and replaced by its leading bytes plus a footer that names the
    spill file.

    The footer is counted against the budget, so the returned string fits in
    ``limit`` whenever ``limit`` is at least as large as the footer. For a
    smaller ``limit`` only the footer is returned (it is never cut, because the
    spill path is the one thing the agent must not lose), which can exceed
    ``limit`` by the footer's length.

    Args:
        result: The raw tool result.
        run_id: Run identifier, used in the spill file name.
        index: Position of this result within the run, used in the file name.
        limit: Inline byte budget; ``<= 0`` disables capping.
        spill_dir: Directory that receives the full output.

    Returns:
        ``result`` itself when no capping applies, otherwise the truncated text.
    """
    if limit <= 0 or not isinstance(result, str):
        return result
    # errors="replace" keeps lone surrogates (e.g. surrogate-escaped file names
    # in tool output) from raising UnicodeEncodeError and failing the turn.
    encoded = result.encode("utf-8", errors="replace")
    if len(encoded) <= limit:
        return result

    try:
        spill_path = _spill_to_file(encoded, run_id=run_id, index=index, spill_dir=spill_dir)
    except OSError as exc:
        # A full disk or read-only home must not abort the turn: still bound
        # the result and tell the agent the full output is unavailable.
        footer = _unsaved_truncation_footer(original_bytes=len(encoded), limit=limit, error=exc)
    else:
        footer = _truncation_footer(original_bytes=len(encoded), limit=limit, spill_path=spill_path)

    head_budget = max(0, limit - len(footer.encode("utf-8", errors="replace")))
    # errors="ignore" drops a partial multi-byte char left at the cut boundary.
    head = encoded[:head_budget].decode("utf-8", errors="ignore")
    return head + footer
def test_cap_tool_result_truncates_and_spills_full_output(tmp_path: Path) -> None:
    """An oversized one-line result is cut to the limit and kept in full on disk."""
    byte_limit = 1024
    payload = "A" * 8000  # single oversized line, like a minified bundle

    result = cap_tool_result(payload, run_id="run-abc", index=2, limit=byte_limit, spill_dir=tmp_path)

    # The inline view is a bounded string carrying the truncation notice.
    assert isinstance(result, str)
    assert len(result.encode("utf-8")) <= byte_limit
    assert "[output truncated:" in result

    # The footer names a file that holds the untruncated output.
    found = SPILL_PATH_RE.search(result)
    assert found is not None
    saved_to = Path(found.group("path"))
    assert saved_to.read_text(encoding="utf-8") == payload
class _FakeToolCallProvider(BaseOpenAIProvider):
    """Test double: a streaming provider that always requests one tool call.

    Every ``acompletion`` call returns a stream of exactly one chunk whose
    ``finish_reason`` is ``"tool_calls"`` and whose delta holds a single
    function call (id ``call_1``, arguments ``"{}"``) naming the tool given at
    construction time.
    """

    SUPPORTS_COMPLETION_STREAMING = True

    def __init__(self, tool_name: str) -> None:
        # NOTE(review): the base-class initializer is intentionally not called,
        # as in the original -- presumably to skip real client setup; confirm.
        self._tool_name = tool_name

    async def acompletion(self, **_kwargs: Any) -> AsyncIterator[ChatCompletionChunk]:
        """Ignore every request argument and hand back the canned stream."""
        return self._single_tool_call_stream()

    async def _single_tool_call_stream(self) -> AsyncIterator[ChatCompletionChunk]:
        """Yield the one chunk that asks the runner to invoke the tool."""
        requested_call = {
            "index": 0,
            "id": "call_1",
            "type": "function",
            "function": {"name": self._tool_name, "arguments": "{}"},
        }
        only_choice = {
            "index": 0,
            "finish_reason": "tool_calls",
            "delta": {"role": "assistant", "tool_calls": [requested_call]},
        }
        yield ChatCompletionChunk.model_validate({
            "id": "chatcmpl_test",
            "object": "chat.completion.chunk",
            "created": 0,
            "model": "gpt-test",
            "choices": [only_choice],
        })
), + _FakeToolCallProvider(tool_name="bigtool"), + ) + + await tape.ensure_bootstrap_anchor() + events = [ + event async for event in runner.run(tape=tape, model="gpt-test", tools=[tool], system_prompt=None, prompt="go") + ] + + tool_result_events = [event for event in events if event.kind == "tool_result"] + assert len(tool_result_events) == 1 + streamed = tool_result_events[0].data["tool_results"][0] + assert len(streamed.encode("utf-8")) <= limit + + # The reconstructed next-turn request must carry the bounded result, not 4 MB. + messages = await tape.read_messages() + tool_messages = [message for message in messages if message.get("role") == "tool"] + assert len(tool_messages) == 1 + content = tool_messages[0]["content"] + assert len(content.encode("utf-8")) <= limit + assert "[output truncated:" in content + + match = SPILL_PATH_RE.search(content) + assert match is not None + spill_path = Path(match.group("path")) + assert spill_path.read_text(encoding="utf-8") == original + + +def test_413_is_treated_as_a_context_overflow_for_auto_handoff() -> None: + body = "413 Request Entity Too Large" + assert is_context_length_error(body) is True diff --git a/website/src/content/docs/docs/reference/settings.mdx b/website/src/content/docs/docs/reference/settings.mdx index df24fbc4..73d3eafe 100644 --- a/website/src/content/docs/docs/reference/settings.mdx +++ b/website/src/content/docs/docs/reference/settings.mdx @@ -47,6 +47,7 @@ class AgentSettings(Settings): api_format: Literal["completion", "responses", "messages"] = "completion" max_steps: int = 50 max_tokens: int = DEFAULT_MAX_TOKENS # 16384 + max_tool_result_bytes: int = DEFAULT_MAX_TOOL_RESULT_BYTES # 131072 model_timeout_seconds: int | None = None client_args: dict[str, Any] | None = None verbose: int = Field(default=0, ge=0, le=2) @@ -65,6 +66,7 @@ Loaded under the YAML root section. | `BUB_API_FORMAT` | `completion` | `api_format` | One of `completion`, `responses`, `messages`. 
| | `BUB_MAX_STEPS` | `50` | `max_steps` | Maximum agent loop iterations per turn. | | `BUB_MAX_TOKENS` | `16384` | `max_tokens` | Maximum tokens per model call. | +| `BUB_MAX_TOOL_RESULT_BYTES` | `131072` | `max_tool_result_bytes` | Max UTF-8 bytes of a single tool result kept inline. Larger output is truncated and spilled to `<BUB_HOME>/tool-output` to avoid `413` errors. `0` disables. | | `BUB_MODEL_TIMEOUT_SECONDS` | `null` | `model_timeout_seconds` | Per-call timeout in seconds. | | `BUB_CLIENT_ARGS` | `null` | `client_args` | Extra kwargs passed to the underlying model client (JSON / dict). | | `BUB_VERBOSE` | `0` | `verbose` | Logging verbosity level (`0`–`2`). | diff --git a/website/src/content/docs/zh-cn/docs/reference/settings.mdx b/website/src/content/docs/zh-cn/docs/reference/settings.mdx index 6261d8c1..4799db63 100644 --- a/website/src/content/docs/zh-cn/docs/reference/settings.mdx +++ b/website/src/content/docs/zh-cn/docs/reference/settings.mdx @@ -47,6 +47,7 @@ class AgentSettings(Settings): api_format: Literal["completion", "responses", "messages"] = "completion" max_steps: int = 50 max_tokens: int = DEFAULT_MAX_TOKENS # 16384 + max_tool_result_bytes: int = DEFAULT_MAX_TOOL_RESULT_BYTES # 131072 model_timeout_seconds: int | None = None client_args: dict[str, Any] | None = None verbose: int = Field(default=0, ge=0, le=2) @@ -65,6 +66,7 @@ class AgentSettings(Settings): | `BUB_API_FORMAT` | `completion` | `api_format` | `completion`、`responses`、`messages` 之一。 | | `BUB_MAX_STEPS` | `50` | `max_steps` | 单次 turn 内 agent 循环的最大步数。 | | `BUB_MAX_TOKENS` | `16384` | `max_tokens` | 单次模型调用的最大 token 数。 | +| `BUB_MAX_TOOL_RESULT_BYTES` | `131072` | `max_tool_result_bytes` | 单个工具结果保留为 inline 的最大 UTF-8 字节数。超出部分会被截断并落盘到 `<BUB_HOME>/tool-output`,避免触发 `413`。设为 `0` 关闭。 | | `BUB_MODEL_TIMEOUT_SECONDS` | `null` | `model_timeout_seconds` | 单次调用的超时秒数。 | | `BUB_CLIENT_ARGS` | `null` | `client_args` | 传递给底层模型 client 的额外 kwargs(JSON / dict)。 | | `BUB_VERBOSE` | `0` | `verbose` | 
日志详细级别(`0`–`2`)。 |