Skip to content

Commit 69ad22e

Browse files
fix(playwright): Filter unsupported context options in persistent browser (#1796)
This PR fixes issue #1784, where PlaywrightCrawler would crash when passing context options (like storage_state) that are unsupported by Playwright's launch_persistent_context method. Closes #1784 --------- Co-authored-by: Vlada Dusek <v.dusek96@gmail.com>
1 parent 02a18ea commit 69ad22e

2 files changed

Lines changed: 111 additions & 7 deletions

File tree

src/crawlee/browsers/_playwright_browser_controller.py

Lines changed: 48 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,15 @@
22

33
from __future__ import annotations
44

5+
import inspect
56
from asyncio import Lock
67
from datetime import datetime, timedelta, timezone
8+
from functools import lru_cache
79
from typing import TYPE_CHECKING, Any, cast
810

911
from browserforge.injectors.playwright import AsyncNewContext
1012
from playwright.async_api import Browser, BrowserContext, Page, ProxySettings
13+
from playwright.async_api import BrowserType as PlaywrightBrowserType
1114
from typing_extensions import override
1215

1316
from crawlee._utils.docs import docs_group
@@ -28,6 +31,18 @@
2831
logger = getLogger(__name__)
2932

3033

34+
# Cache Playwright signatures to avoid overhead in critical path
35+
@lru_cache(maxsize=1)
36+
def _get_context_params_cache() -> dict[str, set[str]]:
37+
launch_persistent_params = set(inspect.signature(PlaywrightBrowserType.launch_persistent_context).parameters)
38+
new_context_params = set(inspect.signature(Browser.new_context).parameters)
39+
return {
40+
'common': launch_persistent_params & new_context_params,
41+
'persistent_unique': launch_persistent_params - new_context_params,
42+
'incognito_unique': new_context_params - launch_persistent_params,
43+
}
44+
45+
3146
@docs_group('Browser management')
3247
class PlaywrightBrowserController(BrowserController):
3348
"""Controller for managing Playwright browser instances and their pages.
@@ -209,6 +224,31 @@ def _on_page_close(self, page: Page) -> None:
209224
"""Handle actions after a page is closed."""
210225
self._pages.remove(page)
211226

227+
def _filter_context_options(self, options: dict[str, Any]) -> dict[str, Any]:
228+
"""Filter browser context options based on the current mode (incognito vs persistent).
229+
230+
Options that are valid only in the other mode are dropped with a warning. Unrecognized options are kept
231+
and passed through so that Playwright itself can raise an appropriate error.
232+
"""
233+
params_cache = _get_context_params_cache()
234+
filtered = dict[str, Any]()
235+
236+
for key, value in options.items():
237+
if self._use_incognito_pages and key in params_cache['persistent_unique']:
238+
logger.warning(
239+
f'Option "{key}" is only supported in persistent context mode '
240+
'(use_incognito_pages=False) and will be ignored.'
241+
)
242+
elif not self._use_incognito_pages and key in params_cache['incognito_unique']:
243+
logger.warning(
244+
f'Option "{key}" is only supported in incognito context mode '
245+
'(use_incognito_pages=True) and will be ignored.'
246+
)
247+
else:
248+
filtered[key] = value
249+
250+
return filtered
251+
212252
async def _create_browser_context(
213253
self,
214254
browser_new_context_options: Mapping[str, Any] | None = None,
@@ -222,11 +262,14 @@ async def _create_browser_context(
222262
`self._fingerprint_generator` is available.
223263
"""
224264
browser_new_context_options = dict(browser_new_context_options) if browser_new_context_options else {}
265+
266+
filtered_options = self._filter_context_options(browser_new_context_options)
267+
225268
if proxy_info:
226-
if browser_new_context_options.get('proxy'):
269+
if filtered_options.get('proxy'):
227270
logger.warning("browser_new_context_options['proxy'] overridden by explicit `proxy_info` argument.")
228271

229-
browser_new_context_options['proxy'] = ProxySettings(
272+
filtered_options['proxy'] = ProxySettings(
230273
server=f'{proxy_info.scheme}://{proxy_info.hostname}:{proxy_info.port}',
231274
username=proxy_info.username,
232275
password=proxy_info.password,
@@ -236,7 +279,7 @@ async def _create_browser_context(
236279
return await AsyncNewContext(
237280
browser=self._browser,
238281
fingerprint=self._fingerprint_generator.generate(),
239-
**browser_new_context_options,
282+
**filtered_options,
240283
)
241284

242285
if self._header_generator:
@@ -256,7 +299,5 @@ async def _create_browser_context(
256299
else:
257300
extra_http_headers = None
258301

259-
browser_new_context_options['extra_http_headers'] = browser_new_context_options.get(
260-
'extra_http_headers', extra_http_headers
261-
)
262-
return await self._browser.new_context(**browser_new_context_options)
302+
filtered_options['extra_http_headers'] = filtered_options.get('extra_http_headers', extra_http_headers)
303+
return await self._browser.new_context(**filtered_options)
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
from __future__ import annotations
2+
3+
import logging
4+
from typing import TYPE_CHECKING
5+
6+
import pytest
7+
from playwright.async_api import Browser, Playwright, async_playwright
8+
9+
from crawlee.browsers import PlaywrightBrowserController
10+
11+
if TYPE_CHECKING:
12+
from collections.abc import AsyncGenerator
13+
14+
15+
@pytest.fixture
16+
async def playwright() -> AsyncGenerator[Playwright, None]:
17+
async with async_playwright() as playwright:
18+
yield playwright
19+
20+
21+
@pytest.fixture
22+
async def browser(playwright: Playwright) -> AsyncGenerator[Browser, None]:
23+
browser = await playwright.chromium.launch()
24+
yield browser
25+
await browser.close()
26+
27+
28+
async def test_controller_validation_typo_passed_through(browser: Browser) -> None:
29+
"""Invalid options (e.g. typos) are passed through so Playwright raises its own error."""
30+
controller = PlaywrightBrowserController(browser)
31+
with pytest.raises(TypeError):
32+
await controller.new_page(browser_new_context_options={'headles': True})
33+
await controller.close()
34+
35+
36+
async def test_controller_validation_cross_mode_persistent(browser: Browser, caplog: pytest.LogCaptureFixture) -> None:
37+
# Default is persistent mode (use_incognito_pages=False)
38+
controller = PlaywrightBrowserController(browser, use_incognito_pages=False)
39+
# storage_state is incognito-only
40+
with caplog.at_level(logging.WARNING):
41+
page = await controller.new_page(browser_new_context_options={'storage_state': {'cookies': [], 'origins': []}})
42+
assert 'Option "storage_state" is only supported in incognito context mode' in caplog.text
43+
await page.close()
44+
await controller.close()
45+
46+
47+
async def test_controller_validation_cross_mode_incognito(browser: Browser, caplog: pytest.LogCaptureFixture) -> None:
48+
controller = PlaywrightBrowserController(browser, use_incognito_pages=True)
49+
# env is persistent-only
50+
with caplog.at_level(logging.WARNING):
51+
page = await controller.new_page(browser_new_context_options={'env': {}})
52+
assert 'Option "env" is only supported in persistent context mode' in caplog.text
53+
await page.close()
54+
await controller.close()
55+
56+
57+
async def test_controller_validation_valid_common(browser: Browser) -> None:
58+
controller = PlaywrightBrowserController(browser)
59+
# viewport is common
60+
page = await controller.new_page(browser_new_context_options={'viewport': {'width': 800, 'height': 600}})
61+
assert page.viewport_size == {'width': 800, 'height': 600}
62+
await page.close()
63+
await controller.close()

0 commit comments

Comments
 (0)