
Commit 610ae46

Address feedback
1 parent 78404c2 commit 610ae46

8 files changed

Lines changed: 146 additions & 61 deletions


src/crawlee/_utils/robots.py

Lines changed: 38 additions & 14 deletions
@@ -13,6 +13,7 @@
 if TYPE_CHECKING:
     from typing_extensions import Self

+    from crawlee._types import EnqueueStrategy
     from crawlee.http_clients import HttpClient
     from crawlee.proxy_configuration import ProxyInfo

@@ -22,7 +23,11 @@

 class RobotsTxtFile:
     def __init__(
-        self, url: str, robots: Protego, http_client: HttpClient | None = None, proxy_info: ProxyInfo | None = None
+        self,
+        url: str,
+        robots: Protego,
+        http_client: HttpClient | None = None,
+        proxy_info: ProxyInfo | None = None,
     ) -> None:
         self._robots = robots
         self._original_url = URL(url).origin()

@@ -90,18 +95,29 @@ def is_allowed(self, url: str, user_agent: str = '*') -> bool:
             return True
         return bool(self._robots.can_fetch(str(check_url), user_agent))

-    def get_sitemaps(self) -> list[str]:
-        """Get the list of same-host sitemap URLs from the robots.txt file."""
-        same_host_sitemaps: list[str] = []
+    def get_sitemaps(self, *, enqueue_strategy: EnqueueStrategy) -> list[str]:
+        """Get the list of sitemap URLs from the robots.txt file, filtered by enqueue strategy.
+
+        Args:
+            enqueue_strategy: Strategy used to filter sitemap entries relative to the robots.txt URL's host.
+                Pass `'same-hostname'` to match the sitemap protocol's same-host expectation, or `'all'` to
+                disable host filtering. Regardless of the strategy, entries with non-`http(s)` schemes are
+                always filtered out.
+        """
+        sitemaps: list[str] = []
         for sitemap_url in self._robots.sitemaps:
-            if matches_enqueue_strategy('same-hostname', target_url=sitemap_url, origin_url=self._original_url):
-                same_host_sitemaps.append(sitemap_url)
+            if matches_enqueue_strategy(
+                strategy=enqueue_strategy,
+                target_url=sitemap_url,
+                origin_url=self._original_url,
+            ):
+                sitemaps.append(sitemap_url)
             else:
                 logger.warning(
                     f'Skipping sitemap {sitemap_url!r} listed in robots.txt at {str(self._original_url)!r}: '
-                    f'cross-host sitemap entries are not allowed by the robots.txt specification.'
+                    f'does not match enqueue strategy {enqueue_strategy!r}.'
                 )
-        return same_host_sitemaps
+        return sitemaps

     def get_crawl_delay(self, user_agent: str = '*') -> int | None:
         """Get the crawl delay for the given user agent.

@@ -113,15 +129,23 @@ def get_crawl_delay(self, user_agent: str = '*') -> int | None:
         crawl_delay = self._robots.crawl_delay(user_agent)
         return int(crawl_delay) if crawl_delay is not None else None

-    async def parse_sitemaps(self) -> Sitemap:
-        """Parse the sitemaps from the robots.txt file and return a `Sitemap` instance."""
-        sitemaps = self.get_sitemaps()
+    async def parse_sitemaps(self, *, enqueue_strategy: EnqueueStrategy) -> Sitemap:
+        """Parse the sitemaps from the robots.txt file and return a `Sitemap` instance.
+
+        Args:
+            enqueue_strategy: Forwarded to `get_sitemaps`; see that method for details.
+        """
+        sitemaps = self.get_sitemaps(enqueue_strategy=enqueue_strategy)
         if not self._http_client:
             raise ValueError('HTTP client is required to parse sitemaps.')

         return await Sitemap.load(sitemaps, self._http_client, self._proxy_info)

-    async def parse_urls_from_sitemaps(self) -> list[str]:
-        """Parse the sitemaps in the robots.txt file and return a list URLs."""
-        sitemap = await self.parse_sitemaps()
+    async def parse_urls_from_sitemaps(self, *, enqueue_strategy: EnqueueStrategy) -> list[str]:
+        """Parse the sitemaps in the robots.txt file and return a list of URLs.
+
+        Args:
+            enqueue_strategy: Forwarded to `get_sitemaps`; see that method for details.
+        """
+        sitemap = await self.parse_sitemaps(enqueue_strategy=enqueue_strategy)
         return sitemap.urls
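
For orientation, a minimal sketch of the reworked API under the 'same-hostname' strategy (the robots.txt content and example.com URLs are made up; `from_content` and the keyword-only `enqueue_strategy` parameter come straight from this commit):

import asyncio

from crawlee._utils.robots import RobotsTxtFile

async def main() -> None:
    content = (
        'User-agent: *\n'
        'Sitemap: https://example.com/sitemap.xml\n'
        'Sitemap: https://cdn.other.test/sitemap.xml\n'
    )
    robots = await RobotsTxtFile.from_content('https://example.com/robots.txt', content)
    # 'same-hostname' keeps only the entry hosted alongside the robots.txt itself ...
    assert robots.get_sitemaps(enqueue_strategy='same-hostname') == ['https://example.com/sitemap.xml']
    # ... while 'all' also keeps the cross-host http(s) entry.
    assert len(robots.get_sitemaps(enqueue_strategy='all')) == 2

asyncio.run(main())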

src/crawlee/_utils/sitemap.py

Lines changed: 1 addition & 1 deletion
@@ -546,7 +546,7 @@ def _check_and_add(url: str) -> bool:

     # Try getting sitemaps from robots.txt first
     robots = await RobotsTxtFile.find(url=hostname_urls[0], http_client=http_client, proxy_info=proxy_info)
-    for sitemap_url in robots.get_sitemaps():
+    for sitemap_url in robots.get_sitemaps(enqueue_strategy='same-hostname'):
         if _check_and_add(sitemap_url):
             yield sitemap_url

src/crawlee/_utils/urls.py

Lines changed: 33 additions & 18 deletions
@@ -16,6 +16,13 @@
     from crawlee._types import EnqueueStrategy


+_ALLOWED_SCHEMES: frozenset[str] = frozenset({'http', 'https'})
+"""URL schemes Crawlee accepts for fetching and enqueuing."""
+
+_HTTP_URL_ADAPTER: TypeAdapter[AnyHttpUrl] = TypeAdapter(AnyHttpUrl)
+"""Pydantic validator for HTTP and HTTPS URLs."""
+
+
 def is_url_absolute(url: str) -> bool:
     """Check if a URL is absolute."""
     url_parsed = URL(url)

@@ -44,30 +51,21 @@ def to_absolute_url_iterator(base_url: str, urls: Iterator[str], logger: Logger
         yield converted_url


-_http_url_adapter = TypeAdapter(AnyHttpUrl)
-
-
 def validate_http_url(value: str | None) -> str | None:
     """Validate the given HTTP URL.

+    Args:
+        value: The URL to validate, or `None` to skip validation.
+
     Raises:
-        pydantic.ValidationError: If the URL is not valid.
+        pydantic.ValidationError: If the URL is malformed or its scheme is not `http`/`https`.
     """
     if value is not None:
-        _http_url_adapter.validate_python(value)
+        _HTTP_URL_ADAPTER.validate_python(value)

     return value


-@lru_cache(maxsize=1)
-def _get_tld_extractor() -> TLDExtract:
-    """Return a lazily-initialized `TLDExtract` instance shared across the module."""
-    # `mkdtemp` (vs `TemporaryDirectory`) returns a path whose lifetime is tied to the process — `TemporaryDirectory`
-    # is collected immediately when its return value is discarded, which would race the directory out from under
-    # tldextract.
-    return TLDExtract(cache_dir=tempfile.mkdtemp())
-
-
 def matches_enqueue_strategy(
     strategy: EnqueueStrategy,
     *,

@@ -76,6 +74,8 @@
 ) -> bool:
     """Check whether `target_url` matches `origin_url` under the given enqueue strategy.

+    Targets with non-http(s) schemes are always rejected, including under `strategy='all'`.
+
     Args:
         strategy: The enqueue strategy to apply.
         target_url: The URL to be evaluated.

@@ -87,6 +87,9 @@
     target = URL(target_url) if isinstance(target_url, str) else target_url
     origin = URL(origin_url) if isinstance(origin_url, str) else origin_url

+    if target.scheme not in _ALLOWED_SCHEMES:
+        return False
+
     if strategy == 'all':
         return True

@@ -97,12 +100,24 @@
         return target.host == origin.host

     if strategy == 'same-domain':
-        extractor = _get_tld_extractor()
-        origin_domain = extractor.extract_str(origin.host).top_domain_under_public_suffix
-        target_domain = extractor.extract_str(target.host).top_domain_under_public_suffix
-        return origin_domain == target_domain
+        return _domain_under_public_suffix(origin.host) == _domain_under_public_suffix(target.host)

     if strategy == 'same-origin':
         return target.host == origin.host and target.scheme == origin.scheme and target.port == origin.port

     assert_never(strategy)
+
+
+@lru_cache(maxsize=1)
+def _get_tld_extractor() -> TLDExtract:
+    """Return a lazily-initialized `TLDExtract` instance shared across the module."""
+    # `mkdtemp` (vs `TemporaryDirectory`) returns a path whose lifetime is tied to the process — `TemporaryDirectory`
+    # is collected immediately when its return value is discarded, which would race the directory out from under
+    # tldextract.
+    return TLDExtract(cache_dir=tempfile.mkdtemp())
+
+
+@lru_cache(maxsize=2048)
+def _domain_under_public_suffix(host: str) -> str:
+    """Return the registrable domain for `host`, cached to avoid re-running the PSL lookup."""
+    return _get_tld_extractor().extract_str(host).top_domain_under_public_suffix
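
As a quick illustration of the new scheme gate in `matches_enqueue_strategy` (expected results mirror the parametrized test further down; the URLs themselves are illustrative):

from crawlee._utils.urls import matches_enqueue_strategy

# Cross-host http(s) targets still pass under 'all' ...
assert matches_enqueue_strategy(strategy='all', target_url='https://other.test/', origin_url='https://example.com/')
# ... but non-http(s) schemes are now rejected even under 'all'.
assert not matches_enqueue_strategy(strategy='all', target_url='mailto:foo@bar.com', origin_url='https://example.com/')
# 'same-domain' still matches subdomains under one registrable domain.
assert matches_enqueue_strategy(strategy='same-domain', target_url='https://api.example.com/', origin_url='https://example.com/')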

src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 4 additions & 2 deletions
@@ -977,7 +977,7 @@ async def _check_url_after_redirects(self, context: TCrawlingContext) -> AsyncGe
     Filter out links that redirect outside of the crawled domain.
     """
     if context.request.loaded_url is not None and not matches_enqueue_strategy(
-        context.request.enqueue_strategy,
+        strategy=context.request.enqueue_strategy,
         origin_url=URL(context.request.url),
         target_url=URL(context.request.loaded_url),
     ):

@@ -1074,7 +1074,9 @@ def _enqueue_links_filter_iterator(
     warning_flag = False

     if matches_enqueue_strategy(
-        strategy, target_url=parsed_target_url, origin_url=parsed_origin_url
+        strategy=strategy,
+        target_url=parsed_target_url,
+        origin_url=parsed_origin_url,
     ) and self._check_url_patterns(target_url, kwargs.get('include'), kwargs.get('exclude')):
         yield request

src/crawlee/request_loaders/_sitemap_request_loader.py

Lines changed: 17 additions & 16 deletions
@@ -129,7 +129,8 @@ def __init__(
         enqueue_strategy: Strategy used to decide which sitemap-derived URLs (both nested-sitemap entries and
             URL entries) are kept relative to the parent sitemap URL. Defaults to `'same-hostname'`, matching
             the sitemap protocol's same-host expectation and the `enqueue_links` default; pass `'all'` to
-            disable filtering.
+            disable filtering. Note: regardless of `enqueue_strategy`, entries with non-`http(s)` schemes are
+            always filtered out.
         max_buffer_size: Maximum number of URLs to buffer in memory.
         http_client: the instance of `HttpClient` to use for fetching sitemaps.
         persist_state_key: A key for persisting the loader's state in the KeyValueStore.

@@ -199,6 +200,18 @@ async def _get_state(self) -> SitemapRequestLoaderState:

         return self._state.current_value

+    def _passes_strategy_filter(self, target: str, parent: URL, parent_url: str, kind: str) -> bool:
+        """Check whether `target` matches the loader's enqueue strategy relative to `parent`, logging if not."""
+        if matches_enqueue_strategy(strategy=self._enqueue_strategy, target_url=target, origin_url=parent):
+            return True
+
+        logger.warning(
+            f'Skipping {kind} {target!r}: does not match enqueue strategy '
+            f'{self._enqueue_strategy!r} relative to {parent_url!r}.'
+        )
+
+        return False
+
     def _check_url_patterns(
         self,
         target_url: str,

@@ -244,8 +257,6 @@ async def _load_sitemaps(self) -> None:
             state.in_progress_sitemap_url = sitemap_url

             parse_options = ParseSitemapOptions(max_depth=0, emit_nested_sitemaps=True, sitemap_retries=3)
-            # Parse the parent sitemap URL once per outer iteration; `matches_enqueue_strategy` is called per
-            # entry below, and re-parsing the same string thousands of times for large sitemaps is wasteful.
             parsed_sitemap_url = URL(sitemap_url)

             async for item in parse_sitemap(

@@ -257,13 +268,9 @@
                 if isinstance(item, NestedSitemap):
                     # Add nested sitemap to queue
                     if item.loc not in state.pending_sitemap_urls and item.loc not in state.processed_sitemap_urls:
-                        if not matches_enqueue_strategy(
-                            self._enqueue_strategy, target_url=item.loc, origin_url=parsed_sitemap_url
+                        if not self._passes_strategy_filter(
+                            item.loc, parsed_sitemap_url, sitemap_url, 'nested sitemap'
                         ):
-                            logger.warning(
-                                f'Skipping nested sitemap {item.loc!r}: does not match enqueue strategy '
-                                f'{self._enqueue_strategy!r} relative to {sitemap_url!r}.'
-                            )
                             continue
                         state.pending_sitemap_urls.append(item.loc)
                     continue

@@ -281,13 +288,7 @@
                 if not self._check_url_patterns(url, self._include, self._exclude):
                     continue

-                if not matches_enqueue_strategy(
-                    self._enqueue_strategy, target_url=url, origin_url=parsed_sitemap_url
-                ):
-                    logger.warning(
-                        f'Skipping sitemap URL {url!r}: does not match enqueue strategy '
-                        f'{self._enqueue_strategy!r} relative to {sitemap_url!r}.'
-                    )
+                if not self._passes_strategy_filter(url, parsed_sitemap_url, sitemap_url, 'sitemap URL'):
                     continue

                 # Check if we have capacity in the queue
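
Putting the loader changes together, a sketch of end-to-end use (the sitemap URL is hypothetical, `http_client` stands in for any configured `HttpClient`, and the fetch loop mirrors the tests at the end of this commit):

from crawlee.http_clients import HttpClient
from crawlee.request_loaders import SitemapRequestLoader

async def collect_sitemap_urls(http_client: HttpClient) -> list[str]:
    loader = SitemapRequestLoader(
        ['https://example.com/sitemap.xml'],  # hypothetical sitemap location
        http_client=http_client,
        enqueue_strategy='same-hostname',  # the default; cross-host and non-http(s) entries are skipped
    )
    fetched: list[str] = []
    while not await loader.is_finished():
        request = await loader.fetch_next_request()
        if request is not None:
            fetched.append(request.url)
            await loader.mark_request_as_handled(request)
    return fetched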

tests/unit/_utils/test_robots.py

Lines changed: 24 additions & 8 deletions
@@ -26,33 +26,49 @@ async def test_allow_disallow_robots_txt(server_url: URL, http_client: HttpClien


 async def test_extract_sitemaps_urls(server_url: URL, http_client: HttpClient) -> None:
-    """Cross-host sitemap entries are dropped from the test fixture's robots.txt."""
+    """Cross-host sitemap entries are dropped under the `'same-hostname'` enqueue strategy."""
     robots = await RobotsTxtFile.find(str(server_url), http_client)
-    # The fixture lists `http://not-exists.com/sitemap_*.xml`, which is cross-host relative to `server_url` and
-    # therefore filtered out per the robots.txt specification.
-    assert robots.get_sitemaps() == []
+    # The fixture lists `http://not-exists.com/sitemap_*.xml`, which is cross-host relative to `server_url`.
+    assert robots.get_sitemaps(enqueue_strategy='same-hostname') == []


 async def test_extract_same_host_sitemaps_urls() -> None:
     """Sitemap entries on the same host as the robots.txt are returned."""
     content = 'User-agent: *\nSitemap: http://example.com/sitemap_1.xml\nSitemap: http://example.com/sitemap_2.xml\n'
     robots = await RobotsTxtFile.from_content('http://example.com/robots.txt', content)
-    assert set(robots.get_sitemaps()) == {
+    assert set(robots.get_sitemaps(enqueue_strategy='same-hostname')) == {
         'http://example.com/sitemap_1.xml',
         'http://example.com/sitemap_2.xml',
     }


-async def test_extract_sitemaps_urls_filters_cross_host() -> None:
-    """Cross-host `Sitemap:` directives in robots.txt are silently filtered."""
+async def test_extract_sitemaps_urls_filters_cross_host_and_non_http() -> None:
+    """Cross-host and non-http(s) `Sitemap:` directives in robots.txt are silently filtered."""
     content = (
         'User-agent: *\n'
         'Sitemap: http://example.com/legit.xml\n'
         'Sitemap: http://other.test/payload.xml\n'
         'Sitemap: gopher://internal:6379/_PING\n'
+        'Sitemap: ftp://example.com/payload.xml\n'
     )
     robots = await RobotsTxtFile.from_content('http://example.com/robots.txt', content)
-    assert robots.get_sitemaps() == ['http://example.com/legit.xml']
+    assert robots.get_sitemaps(enqueue_strategy='same-hostname') == ['http://example.com/legit.xml']
+
+
+async def test_get_sitemaps_with_strategy_all_returns_cross_host() -> None:
+    """`enqueue_strategy='all'` disables host filtering but still rejects non-http(s) schemes."""
+    content = (
+        'User-agent: *\n'
+        'Sitemap: http://example.com/legit.xml\n'
+        'Sitemap: http://other.test/payload.xml\n'
+        'Sitemap: gopher://internal:6379/_PING\n'
+        'Sitemap: ftp://example.com/payload.xml\n'
+    )
+    robots = await RobotsTxtFile.from_content('http://example.com/robots.txt', content)
+    assert set(robots.get_sitemaps(enqueue_strategy='all')) == {
+        'http://example.com/legit.xml',
+        'http://other.test/payload.xml',
+    }


 async def test_parse_from_content() -> None:

tests/unit/_utils/test_urls.py

Lines changed: 7 additions & 2 deletions
@@ -70,17 +70,22 @@ def test_validate_http_url_rejects_non_http_scheme(invalid_url: str) -> None:
 @pytest.mark.parametrize(
     ('strategy', 'origin', 'target', 'expected'),
     [
-        # 'all' lets everything through, even with empty/cross-host targets
+        # 'all' lets http(s) through across hosts, but rejects non-http(s) schemes
         ('all', 'https://example.com/', 'https://other.test/', True),
-        ('all', 'https://example.com/', 'gopher://internal:6379/_PING', True),
+        ('all', 'https://example.com/', 'gopher://internal:6379/_PING', False),
+        ('all', 'https://example.com/', 'mailto:foo@bar.com', False),
+        ('all', 'https://example.com/', 'javascript:alert(1)', False),
+        ('all', 'https://example.com/', 'ftp://example.com/', False),
         # 'same-hostname' is exact host equality
         ('same-hostname', 'https://example.com/a', 'https://example.com/b', True),
         ('same-hostname', 'https://example.com/', 'https://www.example.com/', False),
         ('same-hostname', 'https://example.com/', 'https://other.test/', False),
+        ('same-hostname', 'https://example.com/', 'mailto:foo@example.com', False),
         # 'same-domain' allows subdomains under the same registrable domain
         ('same-domain', 'https://example.com/', 'https://www.example.com/', True),
         ('same-domain', 'https://example.com/', 'https://api.example.com/', True),
         ('same-domain', 'https://example.com/', 'https://other.test/', False),
+        ('same-domain', 'https://example.com/', 'ftp://www.example.com/', False),
         # 'same-origin' requires scheme + host + port match
         ('same-origin', 'https://example.com/', 'https://example.com/path', True),
         ('same-origin', 'https://example.com/', 'http://example.com/', False),

tests/unit/request_loaders/test_sitemap_request_loader.py

Lines changed: 22 additions & 0 deletions
@@ -398,3 +398,25 @@ async def test_sitemap_loader_strategy_all_disables_filtering(server_url: URL, h
         await loader.mark_request_as_handled(request)

     assert fetched == [cross_host_url]
+
+
+async def test_sitemap_loader_drops_non_http_scheme_under_strategy_all(
+    server_url: URL, http_client: HttpClient
+) -> None:
+    """Even with `enqueue_strategy='all'`, sitemap entries with non-http(s) schemes are dropped."""
+    http_url = 'http://other.test/page'
+    sitemap_content = _make_urlset(
+        [http_url, 'mailto:foo@bar.com', 'javascript:alert(1)', 'ftp://example.com/file.txt']
+    )
+    sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encode_base64(sitemap_content.encode()))
+
+    loader = SitemapRequestLoader([str(sitemap_url)], http_client=http_client, enqueue_strategy='all')
+
+    fetched: list[str] = []
+    while not await loader.is_finished():
+        request = await loader.fetch_next_request()
+        if request is not None:
+            fetched.append(request.url)
+            await loader.mark_request_as_handled(request)
+
+    assert fetched == [http_url]
