diff --git a/env.example b/env.example
index 191a8ab9..c738072b 100644
--- a/env.example
+++ b/env.example
@@ -9,6 +9,10 @@
 # BUB_MODEL=openrouter:openrouter/free
 # BUB_MAX_STEPS=50
 # BUB_MAX_TOKENS=16384
+# Max UTF-8 bytes of a single tool result kept inline; larger output is
+# truncated and the full text is saved under <bub.home>/tool-output. Set to 0
+# to disable the cap. Guards against 413 Request Entity Too Large errors.
+# BUB_MAX_TOOL_RESULT_BYTES=131072
 # BUB_MODEL_TIMEOUT_SECONDS=300
 # BUB_HOME=~/.bub
 
diff --git a/src/bub/builtin/model_runner.py b/src/bub/builtin/model_runner.py
index 2664b964..6a552b14 100644
--- a/src/bub/builtin/model_runner.py
+++ b/src/bub/builtin/model_runner.py
@@ -26,11 +26,12 @@
 
 from bub.builtin.settings import AgentSettings, ModelCandidate
 from bub.builtin.tape import Tape
+from bub.builtin.tool_output import cap_tool_result
 from bub.runtime import AsyncStreamEvents, BubError, ErrorKind, StreamEvent, StreamState
 from bub.tools import Tool, ToolContext, ToolExecutor
 
 CONTEXT_LENGTH_PATTERNS = re.compile(
-    r"context.{0,20}(?:length|window)|maximum.{0,20}context|token.{0,10}limit|prompt.{0,10}too long|tokens? > \d+ maximum",
+    r"context.{0,20}(?:length|window)|maximum.{0,20}context|token.{0,10}limit|prompt.{0,10}too long|tokens? > \d+ maximum|request entity too large",
     re.IGNORECASE,
 )
 TOOL_ARGUMENTS_ADAPTER = TypeAdapter(dict[str, Any])
@@ -130,6 +131,7 @@ async def iterator() -> AsyncGenerator[StreamEvent, None]:
                     tool_invocations,
                     context=context,
                 )
+                tool_results = self._cap_tool_results(execution.tool_results, run_id=run_id)
                 await self.record_chat(
                     tape=tape,
                     run_id=run_id,
@@ -137,14 +139,14 @@ async def iterator() -> AsyncGenerator[StreamEvent, None]:
                     new_messages=new_messages,
                     response_text=None,
                     tool_calls=serialized_tool_calls,
-                    tool_results=execution.tool_results,
+                    tool_results=tool_results,
                     response=output.response,
                     model=model,
                     usage=state.usage,
                 )
-                yield StreamEvent("tool_result", {"tool_results": execution.tool_results})
+                yield StreamEvent("tool_result", {"tool_results": tool_results})
                 yield StreamEvent(
-                    "final", {"ok": True, "tool_calls": serialized_tool_calls, "tool_results": execution.tool_results}
+                    "final", {"ok": True, "tool_calls": serialized_tool_calls, "tool_results": tool_results}
                 )
                 return
 
@@ -163,6 +165,21 @@ async def iterator() -> AsyncGenerator[StreamEvent, None]:
 
         return AsyncStreamEvents(iterator(), state=state)
 
+    def _cap_tool_results(self, results: list[Any], *, run_id: str) -> list[Any]:
+        """Return ``results`` with every entry passed through ``cap_tool_result``.
+
+        Overflow is spilled to ``<bub.home>/tool-output``, keyed by run and position.
+        """
+        import bub
+
+        max_bytes = self.settings.max_tool_result_bytes
+        output_dir = bub.home / "tool-output"
+        capped: list[Any] = []
+        for position, item in enumerate(results):
+            bounded = cap_tool_result(item, run_id=run_id, index=position, limit=max_bytes, spill_dir=output_dir)
+            capped.append(bounded)
+        return capped
+
     @staticmethod
     def generate_run_id() -> str:
         return f"run-{datetime.now(UTC).strftime('%Y%m%dT%H%M%S%fZ')}"
diff --git a/src/bub/builtin/settings.py b/src/bub/builtin/settings.py
index 49d88389..f301b9ee 100644
--- a/src/bub/builtin/settings.py
+++ b/src/bub/builtin/settings.py
@@ -13,6 +13,7 @@
 from pydantic_settings import BaseSettings, PydanticBaseSettingsSource, SettingsConfigDict
 
 from bub import Settings, config, ensure_config
+from bub.builtin.tool_output import DEFAULT_MAX_TOOL_RESULT_BYTES
 
 DEFAULT_MODEL = "openrouter:openrouter/free"
 DEFAULT_MAX_TOKENS = 16384
@@ -58,6 +59,14 @@ class AgentSettings(Settings):
     api_base: str | dict[str, str] | None = None
     max_steps: int = 50
     max_tokens: int = DEFAULT_MAX_TOKENS
+    max_tool_result_bytes: int = Field(
+        default=DEFAULT_MAX_TOOL_RESULT_BYTES,
+        description=(
+            "Max UTF-8 bytes of a single tool result kept inline. Larger results are "
+            "truncated and spilled to <bub.home>/tool-output to avoid 413 errors. "
+            "Set to 0 to disable."
+        ),
+    )
     model_timeout_seconds: int | None = None
     client_args: dict[str, Any] = Field(default_factory=dict)
     verbose: int = Field(default=0, description="Verbosity level for logging. Higher means more verbose.", ge=0, le=2)
diff --git a/src/bub/builtin/tool_output.py b/src/bub/builtin/tool_output.py
new file mode 100644
index 00000000..812e2849
--- /dev/null
+++ b/src/bub/builtin/tool_output.py
@@ -0,0 +1,76 @@
+"""Bound tool-result size before it reaches the tape and the next model request.
+
+A single oversized tool result -- e.g. ``grep`` over a minified bundle or source
+map under ``node_modules`` -- can blow past the provider's request-body limit and
+fail the whole turn with a ``413 Request Entity Too Large``. ``head -50`` is no
+protection when individual lines are megabytes long.
+
+We cap each result at a byte budget and spill the full output to a file, so the
+agent keeps a bounded inline view *and* the complete output for follow-up
+inspection with ``tail`` / ``rg``. The cap is enforced at the result boundary, so
+it protects the tape, the trace, the streamed event, and the next request alike.
+"""
+
+from __future__ import annotations
+
+import shlex
+from pathlib import Path
+
+DEFAULT_MAX_TOOL_RESULT_BYTES = 128 * 1024
+
+
+def _human_bytes(num_bytes: int) -> str:
+    """Format a byte count with 1024-based units: whole bytes, else one decimal."""
+    units = ("B", "KB", "MB", "GB", "TB", "PB")
+    value, step = float(num_bytes), 0
+    while value >= 1024 and step < len(units) - 1:
+        value, step = value / 1024, step + 1
+    return f"{int(value)} B" if step == 0 else f"{value:.1f} {units[step]}"
+
+
+def _spill_to_file(data: bytes, *, run_id: str, index: int, spill_dir: Path) -> Path:
+    """Save ``data`` as ``<run_id>-call-<index>.txt`` in ``spill_dir``; return the path."""
+    spill_dir.mkdir(parents=True, exist_ok=True)
+    (target := spill_dir.joinpath(f"{run_id}-call-{index}.txt")).write_bytes(data)
+    return target
+
+
+def _truncation_footer(*, original_bytes: int, limit: int, spill_path: Path) -> str:
+    """Build the bracketed notice appended after the truncated head of a result."""
+    # Shell-quote the path so the suggested commands can be pasted verbatim.
+    shell_path = shlex.quote(str(spill_path))
+    notice = f"[output truncated: original {_human_bytes(original_bytes)} exceeded {_human_bytes(limit)} limit]"
+    saved = f"[full output saved to: {spill_path}]"
+    hint = f"[hint: inspect the end with `tail -c 4096 {shell_path}` or search it with `rg <pattern> {shell_path}`]"
+    # Two leading newlines separate the notice from whatever output precedes it.
+    return f"\n\n{notice}\n{saved}\n{hint}"
+
+
+def cap_tool_result(
+    result: object,
+    *,
+    run_id: str,
+    index: int,
+    limit: int,
+    spill_dir: Path,
+) -> object:
+    """Bound a single tool result so it cannot blow past the request-body limit.
+
+    A string over ``limit`` UTF-8 bytes is spilled in full to ``spill_dir`` and cut
+    down to a head plus a footer naming the spill file. The footer is counted against
+    the budget, so the result fits ``limit`` unless the footer alone is larger (then
+    only the footer is returned). Non-string results, results within budget, and
+    non-positive limits pass through unchanged.
+    """
+    if limit <= 0 or not isinstance(result, str):
+        return result
+    # errors="replace": a lone surrogate (e.g. surrogateescape-decoded tool output)
+    # must not raise UnicodeEncodeError and fail the turn this cap is protecting.
+    encoded = result.encode("utf-8", errors="replace")
+    if len(encoded) <= limit:
+        return result
+    spill_path = _spill_to_file(encoded, run_id=run_id, index=index, spill_dir=spill_dir)
+    footer = _truncation_footer(original_bytes=len(encoded), limit=limit, spill_path=spill_path)
+    head_budget = max(0, limit - len(footer.encode("utf-8")))
+    # errors="ignore" drops a partial multi-byte char left at the cut boundary.
+    return encoded[:head_budget].decode("utf-8", errors="ignore") + footer
diff --git a/tests/test_tool_output_cap.py b/tests/test_tool_output_cap.py
new file mode 100644
index 00000000..e0af9bce
--- /dev/null
+++ b/tests/test_tool_output_cap.py
@@ -0,0 +1,149 @@
+from __future__ import annotations
+
+import re
+from collections.abc import AsyncIterator, Iterator
+from pathlib import Path
+from typing import Any
+
+import pytest
+from any_llm.constants import LLMProvider
+from any_llm.providers.openai.base import BaseOpenAIProvider
+from any_llm.types.completion import ChatCompletionChunk
+
+from bub.builtin.context import default_tape_context
+from bub.builtin.model_runner import ModelRunner, is_context_length_error
+from bub.builtin.settings import AgentSettings, ModelCandidate
+from bub.builtin.tape import Tape
+from bub.builtin.tool_output import cap_tool_result
+from bub.tape import AsyncTapeStoreAdapter, InMemoryTapeStore
+from bub.tools import Tool
+
+SPILL_PATH_RE = re.compile(r"\[full output saved to: (?P<path>.+)\]")
+
+
+def test_cap_tool_result_passes_through_small_and_non_string(tmp_path: Path) -> None:
+    """Results under budget, non-strings, and a zero limit come back untouched."""
+    oversized, mapping = "x" * 5000, {"k": "v"}
+    assert cap_tool_result("small", run_id="r", index=0, limit=1024, spill_dir=tmp_path) == "small"
+    assert cap_tool_result(mapping, run_id="r", index=0, limit=8, spill_dir=tmp_path) == {"k": "v"}
+    assert cap_tool_result(oversized, run_id="r", index=0, limit=0, spill_dir=tmp_path) == oversized
+    assert not any(tmp_path.iterdir())  # nothing was spilled
+
+
+def test_cap_tool_result_truncates_and_spills_full_output(tmp_path: Path) -> None:
+    """An over-budget string is cut to the limit and saved in full on disk."""
+    budget = 1024
+    payload = "A" * 8000  # one long line, the shape a minified bundle produces
+
+    bounded = cap_tool_result(payload, run_id="run-abc", index=2, limit=budget, spill_dir=tmp_path)
+
+    assert isinstance(bounded, str)
+    assert "[output truncated:" in bounded
+    assert len(bounded.encode("utf-8")) <= budget
+    found = SPILL_PATH_RE.search(bounded)
+    assert found is not None
+    assert Path(found.group("path")).read_text(encoding="utf-8") == payload
+
+
+def test_cap_tool_result_does_not_split_multibyte_chars(tmp_path: Path) -> None:
+    """Cutting inside a 3-byte character must drop it whole, never leave a fragment."""
+    original = "你" * 4000  # 3 bytes each in UTF-8
+    capped = cap_tool_result(original, run_id="run-utf8", index=0, limit=1024, spill_dir=tmp_path)
+    assert isinstance(capped, str)
+    assert len(capped.encode("utf-8")) <= 1024
+    # Everything before the footer must be intact source characters and nothing else.
+    assert set(capped.partition("\n\n[output truncated:")[0]) <= {"你"}
+    match = SPILL_PATH_RE.search(capped)
+    assert match is not None and Path(match.group("path")).read_text(encoding="utf-8") == original
+
+
+class _FakeToolCallProvider(BaseOpenAIProvider):
+    """Provider stub whose stream is one chunk requesting a call to ``tool_name``."""
+
+    SUPPORTS_COMPLETION_STREAMING = True
+
+    def __init__(self, tool_name: str) -> None:
+        self._tool_name = tool_name
+
+    async def acompletion(self, **_kwargs: Any) -> AsyncIterator[ChatCompletionChunk]:
+        # Assemble the chunk payload inside-out: tool call -> choice -> chunk.
+        tool_call = {
+            "index": 0,
+            "id": "call_1",
+            "type": "function",
+            "function": {"name": self._tool_name, "arguments": "{}"},
+        }
+        choice = {
+            "index": 0,
+            "finish_reason": "tool_calls",
+            "delta": {"role": "assistant", "tool_calls": [tool_call]},
+        }
+        payload = {
+            "id": "chatcmpl_test",
+            "object": "chat.completion.chunk",
+            "created": 0,
+            "model": "gpt-test",
+            "choices": [choice],
+        }
+
+        async def single_chunk() -> AsyncIterator[ChatCompletionChunk]:
+            yield ChatCompletionChunk.model_validate(payload)
+
+        return single_chunk()
+
+
+class _FakeOpenAIModelRunner(ModelRunner):  # test double: always yields the injected fake provider
+    def __init__(self, settings: AgentSettings, llm: _FakeToolCallProvider) -> None:
+        super().__init__(settings)
+        self._llm = llm  # the single client handed out by iter_llm_clients
+
+    def iter_llm_clients(self, model: str) -> Iterator[tuple[ModelCandidate, _FakeToolCallProvider]]:
+        yield ModelCandidate(provider=LLMProvider.OPENAI, model_id=model, name=f"openai:{model}"), self._llm
+
+
+@pytest.mark.asyncio
+async def test_oversized_tool_result_is_bounded_in_next_request(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
+    monkeypatch.setenv("BUB_HOME", str(tmp_path / "home"))  # assumes bub.home follows BUB_HOME -- TODO confirm
+    limit = 1024
+    original = "B" * (4 * 1024 * 1024)  # 4 MiB on a single line, far above ``limit``
+
+    def bigtool() -> str:
+        return original
+
+    tool = Tool.from_callable(bigtool, name="bigtool")
+    store = InMemoryTapeStore()
+    tape = Tape(tmp_path, AsyncTapeStoreAdapter(store), default_tape_context()).scoped("test-tape")
+    runner = _FakeOpenAIModelRunner(
+        AgentSettings.model_construct(
+            model="openai:gpt-test", max_tokens=100, model_timeout_seconds=None, max_tool_result_bytes=limit
+        ),
+        _FakeToolCallProvider(tool_name="bigtool"),
+    )
+
+    await tape.ensure_bootstrap_anchor()
+    events = [
+        event async for event in runner.run(tape=tape, model="gpt-test", tools=[tool], system_prompt=None, prompt="go")
+    ]
+
+    tool_result_events = [event for event in events if event.kind == "tool_result"]
+    assert len(tool_result_events) == 1  # the single tool call yields one tool_result event
+    streamed = tool_result_events[0].data["tool_results"][0]
+    assert len(streamed.encode("utf-8")) <= limit  # the streamed event carries the capped text
+
+    # The reconstructed next-turn request must carry the bounded result, not 4 MB.
+    messages = await tape.read_messages()
+    tool_messages = [message for message in messages if message.get("role") == "tool"]
+    assert len(tool_messages) == 1
+    content = tool_messages[0]["content"]
+    assert len(content.encode("utf-8")) <= limit
+    assert "[output truncated:" in content
+
+    match = SPILL_PATH_RE.search(content)
+    assert match is not None
+    spill_path = Path(match.group("path"))
+    assert spill_path.read_text(encoding="utf-8") == original  # spill file holds the uncapped output
+
+
+def test_413_is_treated_as_a_context_overflow_for_auto_handoff() -> None:
+    """An HTML 413 error page is classified as a context-length error."""
+    assert is_context_length_error("<html><title>413 Request Entity Too Large</title></html>") is True
diff --git a/website/src/content/docs/docs/reference/settings.mdx b/website/src/content/docs/docs/reference/settings.mdx
index df24fbc4..73d3eafe 100644
--- a/website/src/content/docs/docs/reference/settings.mdx
+++ b/website/src/content/docs/docs/reference/settings.mdx
@@ -47,6 +47,7 @@ class AgentSettings(Settings):
     api_format: Literal["completion", "responses", "messages"] = "completion"
     max_steps: int = 50
     max_tokens: int = DEFAULT_MAX_TOKENS  # 16384
+    max_tool_result_bytes: int = DEFAULT_MAX_TOOL_RESULT_BYTES  # 131072
     model_timeout_seconds: int | None = None
     client_args: dict[str, Any] | None = None
     verbose: int = Field(default=0, ge=0, le=2)
@@ -65,6 +66,7 @@ Loaded under the YAML root section.
 | `BUB_API_FORMAT` | `completion` | `api_format` | One of `completion`, `responses`, `messages`. |
 | `BUB_MAX_STEPS` | `50` | `max_steps` | Maximum agent loop iterations per turn. |
 | `BUB_MAX_TOKENS` | `16384` | `max_tokens` | Maximum tokens per model call. |
+| `BUB_MAX_TOOL_RESULT_BYTES` | `131072` | `max_tool_result_bytes` | Max UTF-8 bytes of a single tool result kept inline. Larger output is truncated and spilled to `<bub.home>/tool-output` to avoid `413` errors. `0` disables. |
 | `BUB_MODEL_TIMEOUT_SECONDS` | `null` | `model_timeout_seconds` | Per-call timeout in seconds. |
 | `BUB_CLIENT_ARGS` | `null` | `client_args` | Extra kwargs passed to the underlying model client (JSON / dict). |
 | `BUB_VERBOSE` | `0` | `verbose` | Logging verbosity level (`0`–`2`). |
diff --git a/website/src/content/docs/zh-cn/docs/reference/settings.mdx b/website/src/content/docs/zh-cn/docs/reference/settings.mdx
index 6261d8c1..4799db63 100644
--- a/website/src/content/docs/zh-cn/docs/reference/settings.mdx
+++ b/website/src/content/docs/zh-cn/docs/reference/settings.mdx
@@ -47,6 +47,7 @@ class AgentSettings(Settings):
     api_format: Literal["completion", "responses", "messages"] = "completion"
     max_steps: int = 50
     max_tokens: int = DEFAULT_MAX_TOKENS  # 16384
+    max_tool_result_bytes: int = DEFAULT_MAX_TOOL_RESULT_BYTES  # 131072
     model_timeout_seconds: int | None = None
     client_args: dict[str, Any] | None = None
     verbose: int = Field(default=0, ge=0, le=2)
@@ -65,6 +66,7 @@ class AgentSettings(Settings):
 | `BUB_API_FORMAT` | `completion` | `api_format` | `completion`、`responses`、`messages` 之一。 |
 | `BUB_MAX_STEPS` | `50` | `max_steps` | 单次 turn 内 agent 循环的最大步数。 |
 | `BUB_MAX_TOKENS` | `16384` | `max_tokens` | 单次模型调用的最大 token 数。 |
+| `BUB_MAX_TOOL_RESULT_BYTES` | `131072` | `max_tool_result_bytes` | 单个工具结果保留为 inline 的最大 UTF-8 字节数。超出部分会被截断并落盘到 `<bub.home>/tool-output`,避免触发 `413`。设为 `0` 关闭。 |
 | `BUB_MODEL_TIMEOUT_SECONDS` | `null` | `model_timeout_seconds` | 单次调用的超时秒数。 |
 | `BUB_CLIENT_ARGS` | `null` | `client_args` | 传递给底层模型 client 的额外 kwargs(JSON / dict)。 |
 | `BUB_VERBOSE` | `0` | `verbose` | 日志详细级别(`0`–`2`)。 |