
Commit 610ae46

Address feedback
1 parent 78404c2 commit 610ae46

8 files changed

Lines changed: 146 additions & 61 deletions


src/crawlee/_utils/robots.py

Lines changed: 38 additions & 14 deletions
@@ -13,6 +13,7 @@
 if TYPE_CHECKING:
     from typing_extensions import Self

+    from crawlee._types import EnqueueStrategy
     from crawlee.http_clients import HttpClient
     from crawlee.proxy_configuration import ProxyInfo

@@ -22,7 +23,11 @@

 class RobotsTxtFile:
     def __init__(
-        self, url: str, robots: Protego, http_client: HttpClient | None = None, proxy_info: ProxyInfo | None = None
+        self,
+        url: str,
+        robots: Protego,
+        http_client: HttpClient | None = None,
+        proxy_info: ProxyInfo | None = None,
     ) -> None:
         self._robots = robots
         self._original_url = URL(url).origin()

@@ -90,18 +95,29 @@ def is_allowed(self, url: str, user_agent: str = '*') -> bool:
             return True
         return bool(self._robots.can_fetch(str(check_url), user_agent))

-    def get_sitemaps(self) -> list[str]:
-        """Get the list of same-host sitemap URLs from the robots.txt file."""
-        same_host_sitemaps: list[str] = []
+    def get_sitemaps(self, *, enqueue_strategy: EnqueueStrategy) -> list[str]:
+        """Get the list of sitemap URLs from the robots.txt file, filtered by enqueue strategy.
+
+        Args:
+            enqueue_strategy: Strategy used to filter sitemap entries relative to the robots.txt URL's host.
+                Pass `'same-hostname'` to match the sitemap protocol's same-host expectation, or `'all'` to
+                disable host filtering. Regardless of the strategy, entries with non-`http(s)` schemes are
+                always filtered out.
+        """
+        sitemaps: list[str] = []
         for sitemap_url in self._robots.sitemaps:
-            if matches_enqueue_strategy('same-hostname', target_url=sitemap_url, origin_url=self._original_url):
-                same_host_sitemaps.append(sitemap_url)
+            if matches_enqueue_strategy(
+                strategy=enqueue_strategy,
+                target_url=sitemap_url,
+                origin_url=self._original_url,
+            ):
+                sitemaps.append(sitemap_url)
             else:
                 logger.warning(
                     f'Skipping sitemap {sitemap_url!r} listed in robots.txt at {str(self._original_url)!r}: '
-                    f'cross-host sitemap entries are not allowed by the robots.txt specification.'
+                    f'does not match enqueue strategy {enqueue_strategy!r}.'
                 )
-        return same_host_sitemaps
+        return sitemaps

     def get_crawl_delay(self, user_agent: str = '*') -> int | None:
         """Get the crawl delay for the given user agent.

@@ -113,15 +129,23 @@ def get_crawl_delay(self, user_agent: str = '*') -> int | None:
         crawl_delay = self._robots.crawl_delay(user_agent)
         return int(crawl_delay) if crawl_delay is not None else None

-    async def parse_sitemaps(self) -> Sitemap:
-        """Parse the sitemaps from the robots.txt file and return a `Sitemap` instance."""
-        sitemaps = self.get_sitemaps()
+    async def parse_sitemaps(self, *, enqueue_strategy: EnqueueStrategy) -> Sitemap:
+        """Parse the sitemaps from the robots.txt file and return a `Sitemap` instance.
+
+        Args:
+            enqueue_strategy: Forwarded to `get_sitemaps`; see that method for details.
+        """
+        sitemaps = self.get_sitemaps(enqueue_strategy=enqueue_strategy)
         if not self._http_client:
             raise ValueError('HTTP client is required to parse sitemaps.')

         return await Sitemap.load(sitemaps, self._http_client, self._proxy_info)

-    async def parse_urls_from_sitemaps(self) -> list[str]:
-        """Parse the sitemaps in the robots.txt file and return a list URLs."""
-        sitemap = await self.parse_sitemaps()
+    async def parse_urls_from_sitemaps(self, *, enqueue_strategy: EnqueueStrategy) -> list[str]:
+        """Parse the sitemaps in the robots.txt file and return a list of URLs.
+
+        Args:
+            enqueue_strategy: Forwarded to `get_sitemaps`; see that method for details.
+        """
+        sitemap = await self.parse_sitemaps(enqueue_strategy=enqueue_strategy)
         return sitemap.urls
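
For orientation, a minimal sketch of the reworked API under the 'same-hostname' strategy (the robots.txt content and example.com URLs are made up; `from_content` and the keyword-only `enqueue_strategy` parameter come straight from this commit):

import asyncio

from crawlee._utils.robots import RobotsTxtFile

async def main() -> None:
    content = (
        'User-agent: *\n'
        'Sitemap: https://example.com/sitemap.xml\n'
        'Sitemap: https://cdn.other.test/sitemap.xml\n'
    )
    robots = await RobotsTxtFile.from_content('https://example.com/robots.txt', content)
    # 'same-hostname' keeps only the entry hosted alongside the robots.txt itself ...
    assert robots.get_sitemaps(enqueue_strategy='same-hostname') == ['https://example.com/sitemap.xml']
    # ... while 'all' also keeps the cross-host http(s) entry.
    assert len(robots.get_sitemaps(enqueue_strategy='all')) == 2

asyncio.run(main())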

src/crawlee/_utils/sitemap.py

Lines changed: 1 addition & 1 deletion
@@ -546,7 +546,7 @@ def _check_and_add(url: str) -> bool:

     # Try getting sitemaps from robots.txt first
     robots = await RobotsTxtFile.find(url=hostname_urls[0], http_client=http_client, proxy_info=proxy_info)
-    for sitemap_url in robots.get_sitemaps():
+    for sitemap_url in robots.get_sitemaps(enqueue_strategy='same-hostname'):
         if _check_and_add(sitemap_url):
             yield sitemap_url

src/crawlee/_utils/urls.py

Lines changed: 33 additions & 18 deletions
@@ -16,6 +16,13 @@
     from crawlee._types import EnqueueStrategy


+_ALLOWED_SCHEMES: frozenset[str] = frozenset({'http', 'https'})
+"""URL schemes Crawlee accepts for fetching and enqueuing."""
+
+_HTTP_URL_ADAPTER: TypeAdapter[AnyHttpUrl] = TypeAdapter(AnyHttpUrl)
+"""Pydantic validator for HTTP and HTTPS URLs."""
+
+
 def is_url_absolute(url: str) -> bool:
     """Check if a URL is absolute."""
     url_parsed = URL(url)

@@ -44,30 +51,21 @@ def to_absolute_url_iterator(base_url: str, urls: Iterator[str], logger: Logger
         yield converted_url


-_http_url_adapter = TypeAdapter(AnyHttpUrl)
-
-
 def validate_http_url(value: str | None) -> str | None:
     """Validate the given HTTP URL.

+    Args:
+        value: The URL to validate, or `None` to skip validation.
+
     Raises:
-        pydantic.ValidationError: If the URL is not valid.
+        pydantic.ValidationError: If the URL is malformed or its scheme is not `http`/`https`.
     """
     if value is not None:
-        _http_url_adapter.validate_python(value)
+        _HTTP_URL_ADAPTER.validate_python(value)

     return value


-@lru_cache(maxsize=1)
-def _get_tld_extractor() -> TLDExtract:
-    """Return a lazily-initialized `TLDExtract` instance shared across the module."""
-    # `mkdtemp` (vs `TemporaryDirectory`) returns a path whose lifetime is tied to the process — `TemporaryDirectory`
-    # is collected immediately when its return value is discarded, which would race the directory out from under
-    # tldextract.
-    return TLDExtract(cache_dir=tempfile.mkdtemp())
-
-
 def matches_enqueue_strategy(
     strategy: EnqueueStrategy,
     *,

@@ -76,6 +74,8 @@
 ) -> bool:
     """Check whether `target_url` matches `origin_url` under the given enqueue strategy.

+    Targets with non-http(s) schemes are always rejected, including under `strategy='all'`.
+
     Args:
         strategy: The enqueue strategy to apply.
         target_url: The URL to be evaluated.

@@ -87,6 +87,9 @@
     target = URL(target_url) if isinstance(target_url, str) else target_url
     origin = URL(origin_url) if isinstance(origin_url, str) else origin_url

+    if target.scheme not in _ALLOWED_SCHEMES:
+        return False
+
     if strategy == 'all':
         return True

@@ -97,12 +100,24 @@
         return target.host == origin.host

     if strategy == 'same-domain':
-        extractor = _get_tld_extractor()
-        origin_domain = extractor.extract_str(origin.host).top_domain_under_public_suffix
-        target_domain = extractor.extract_str(target.host).top_domain_under_public_suffix
-        return origin_domain == target_domain
+        return _domain_under_public_suffix(origin.host) == _domain_under_public_suffix(target.host)

     if strategy == 'same-origin':
         return target.host == origin.host and target.scheme == origin.scheme and target.port == origin.port

     assert_never(strategy)
+
+
+@lru_cache(maxsize=1)
+def _get_tld_extractor() -> TLDExtract:
+    """Return a lazily-initialized `TLDExtract` instance shared across the module."""
+    # `mkdtemp` (vs `TemporaryDirectory`) returns a path whose lifetime is tied to the process — `TemporaryDirectory`
+    # is collected immediately when its return value is discarded, which would race the directory out from under
+    # tldextract.
+    return TLDExtract(cache_dir=tempfile.mkdtemp())
+
+
+@lru_cache(maxsize=2048)
+def _domain_under_public_suffix(host: str) -> str:
+    """Return the registrable domain for `host`, cached to avoid re-running the PSL lookup."""
+    return _get_tld_extractor().extract_str(host).top_domain_under_public_suffix
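
As a quick illustration of the new scheme gate in `matches_enqueue_strategy` (expected results mirror the parametrized test further down; the URLs themselves are illustrative):

from crawlee._utils.urls import matches_enqueue_strategy

# Cross-host http(s) targets still pass under 'all' ...
assert matches_enqueue_strategy(strategy='all', target_url='https://other.test/', origin_url='https://example.com/')
# ... but non-http(s) schemes are now rejected even under 'all'.
assert not matches_enqueue_strategy(strategy='all', target_url='mailto:foo@bar.com', origin_url='https://example.com/')
# 'same-domain' still matches subdomains under one registrable domain.
assert matches_enqueue_strategy(strategy='same-domain', target_url='https://api.example.com/', origin_url='https://example.com/')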

src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 4 additions & 2 deletions
@@ -977,7 +977,7 @@ async def _check_url_after_redirects(self, context: TCrawlingContext) -> AsyncGe
     Filter out links that redirect outside of the crawled domain.
     """
     if context.request.loaded_url is not None and not matches_enqueue_strategy(
-        context.request.enqueue_strategy,
+        strategy=context.request.enqueue_strategy,
         origin_url=URL(context.request.url),
         target_url=URL(context.request.loaded_url),
     ):

@@ -1074,7 +1074,9 @@ def _enqueue_links_filter_iterator(
     warning_flag = False

     if matches_enqueue_strategy(
-        strategy, target_url=parsed_target_url, origin_url=parsed_origin_url
+        strategy=strategy,
+        target_url=parsed_target_url,
+        origin_url=parsed_origin_url,
     ) and self._check_url_patterns(target_url, kwargs.get('include'), kwargs.get('exclude')):
         yield request

src/crawlee/request_loaders/_sitemap_request_loader.py

Lines changed: 17 additions & 16 deletions
@@ -129,7 +129,8 @@ def __init__(
         enqueue_strategy: Strategy used to decide which sitemap-derived URLs (both nested-sitemap entries and
             URL entries) are kept relative to the parent sitemap URL. Defaults to `'same-hostname'`, matching
             the sitemap protocol's same-host expectation and the `enqueue_links` default; pass `'all'` to
-            disable filtering.
+            disable filtering. Note: regardless of `enqueue_strategy`, entries with non-`http(s)` schemes are
+            always filtered out.
         max_buffer_size: Maximum number of URLs to buffer in memory.
         http_client: the instance of `HttpClient` to use for fetching sitemaps.
         persist_state_key: A key for persisting the loader's state in the KeyValueStore.

@@ -199,6 +200,18 @@ async def _get_state(self) -> SitemapRequestLoaderState:

         return self._state.current_value

+    def _passes_strategy_filter(self, target: str, parent: URL, parent_url: str, kind: str) -> bool:
+        """Check whether `target` matches the loader's enqueue strategy relative to `parent`, logging if not."""
+        if matches_enqueue_strategy(strategy=self._enqueue_strategy, target_url=target, origin_url=parent):
+            return True
+
+        logger.warning(
+            f'Skipping {kind} {target!r}: does not match enqueue strategy '
+            f'{self._enqueue_strategy!r} relative to {parent_url!r}.'
+        )
+
+        return False
+
     def _check_url_patterns(
         self,
         target_url: str,

@@ -244,8 +257,6 @@ async def _load_sitemaps(self) -> None:
             state.in_progress_sitemap_url = sitemap_url

             parse_options = ParseSitemapOptions(max_depth=0, emit_nested_sitemaps=True, sitemap_retries=3)
-            # Parse the parent sitemap URL once per outer iteration; `matches_enqueue_strategy` is called per
-            # entry below, and re-parsing the same string thousands of times for large sitemaps is wasteful.
             parsed_sitemap_url = URL(sitemap_url)

             async for item in parse_sitemap(

@@ -257,13 +268,9 @@
                 if isinstance(item, NestedSitemap):
                     # Add nested sitemap to queue
                     if item.loc not in state.pending_sitemap_urls and item.loc not in state.processed_sitemap_urls:
-                        if not matches_enqueue_strategy(
-                            self._enqueue_strategy, target_url=item.loc, origin_url=parsed_sitemap_url
+                        if not self._passes_strategy_filter(
+                            item.loc, parsed_sitemap_url, sitemap_url, 'nested sitemap'
                         ):
-                            logger.warning(
-                                f'Skipping nested sitemap {item.loc!r}: does not match enqueue strategy '
-                                f'{self._enqueue_strategy!r} relative to {sitemap_url!r}.'
-                            )
                             continue
                         state.pending_sitemap_urls.append(item.loc)
                     continue

@@ -281,13 +288,7 @@
                 if not self._check_url_patterns(url, self._include, self._exclude):
                     continue

-                if not matches_enqueue_strategy(
-                    self._enqueue_strategy, target_url=url, origin_url=parsed_sitemap_url
-                ):
-                    logger.warning(
-                        f'Skipping sitemap URL {url!r}: does not match enqueue strategy '
-                        f'{self._enqueue_strategy!r} relative to {sitemap_url!r}.'
-                    )
+                if not self._passes_strategy_filter(url, parsed_sitemap_url, sitemap_url, 'sitemap URL'):
                     continue

                 # Check if we have capacity in the queue
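
Putting the loader changes together, a sketch of end-to-end use (the sitemap URL is hypothetical, `http_client` stands in for any configured `HttpClient`, and the fetch loop mirrors the tests at the end of this commit):

from crawlee.http_clients import HttpClient
from crawlee.request_loaders import SitemapRequestLoader

async def collect_sitemap_urls(http_client: HttpClient) -> list[str]:
    loader = SitemapRequestLoader(
        ['https://example.com/sitemap.xml'],  # hypothetical sitemap location
        http_client=http_client,
        enqueue_strategy='same-hostname',  # the default; cross-host and non-http(s) entries are skipped
    )
    fetched: list[str] = []
    while not await loader.is_finished():
        request = await loader.fetch_next_request()
        if request is not None:
            fetched.append(request.url)
            await loader.mark_request_as_handled(request)
    return fetched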

tests/unit/_utils/test_robots.py

Lines changed: 24 additions & 8 deletions
@@ -26,33 +26,49 @@ async def test_allow_disallow_robots_txt(server_url: URL, http_client: HttpClien


 async def test_extract_sitemaps_urls(server_url: URL, http_client: HttpClient) -> None:
-    """Cross-host sitemap entries are dropped from the test fixture's robots.txt."""
+    """Cross-host sitemap entries are dropped under the `'same-hostname'` enqueue strategy."""
     robots = await RobotsTxtFile.find(str(server_url), http_client)
-    # The fixture lists `http://not-exists.com/sitemap_*.xml`, which is cross-host relative to `server_url` and
-    # therefore filtered out per the robots.txt specification.
-    assert robots.get_sitemaps() == []
+    # The fixture lists `http://not-exists.com/sitemap_*.xml`, which is cross-host relative to `server_url`.
+    assert robots.get_sitemaps(enqueue_strategy='same-hostname') == []


 async def test_extract_same_host_sitemaps_urls() -> None:
     """Sitemap entries on the same host as the robots.txt are returned."""
     content = 'User-agent: *\nSitemap: http://example.com/sitemap_1.xml\nSitemap: http://example.com/sitemap_2.xml\n'
     robots = await RobotsTxtFile.from_content('http://example.com/robots.txt', content)
-    assert set(robots.get_sitemaps()) == {
+    assert set(robots.get_sitemaps(enqueue_strategy='same-hostname')) == {
         'http://example.com/sitemap_1.xml',
         'http://example.com/sitemap_2.xml',
     }


-async def test_extract_sitemaps_urls_filters_cross_host() -> None:
-    """Cross-host `Sitemap:` directives in robots.txt are silently filtered."""
+async def test_extract_sitemaps_urls_filters_cross_host_and_non_http() -> None:
+    """Cross-host and non-http(s) `Sitemap:` directives in robots.txt are silently filtered."""
     content = (
         'User-agent: *\n'
         'Sitemap: http://example.com/legit.xml\n'
         'Sitemap: http://other.test/payload.xml\n'
         'Sitemap: gopher://internal:6379/_PING\n'
+        'Sitemap: ftp://example.com/payload.xml\n'
     )
     robots = await RobotsTxtFile.from_content('http://example.com/robots.txt', content)
-    assert robots.get_sitemaps() == ['http://example.com/legit.xml']
+    assert robots.get_sitemaps(enqueue_strategy='same-hostname') == ['http://example.com/legit.xml']
+
+
+async def test_get_sitemaps_with_strategy_all_returns_cross_host() -> None:
+    """`enqueue_strategy='all'` disables host filtering but still rejects non-http(s) schemes."""
+    content = (
+        'User-agent: *\n'
+        'Sitemap: http://example.com/legit.xml\n'
+        'Sitemap: http://other.test/payload.xml\n'
+        'Sitemap: gopher://internal:6379/_PING\n'
+        'Sitemap: ftp://example.com/payload.xml\n'
+    )
+    robots = await RobotsTxtFile.from_content('http://example.com/robots.txt', content)
+    assert set(robots.get_sitemaps(enqueue_strategy='all')) == {
+        'http://example.com/legit.xml',
+        'http://other.test/payload.xml',
+    }


 async def test_parse_from_content() -> None:

tests/unit/_utils/test_urls.py

Lines changed: 7 additions & 2 deletions
@@ -70,17 +70,22 @@ def test_validate_http_url_rejects_non_http_scheme(invalid_url: str) -> None:
 @pytest.mark.parametrize(
     ('strategy', 'origin', 'target', 'expected'),
     [
-        # 'all' lets everything through, even with empty/cross-host targets
+        # 'all' lets http(s) through across hosts, but rejects non-http(s) schemes
         ('all', 'https://example.com/', 'https://other.test/', True),
-        ('all', 'https://example.com/', 'gopher://internal:6379/_PING', True),
+        ('all', 'https://example.com/', 'gopher://internal:6379/_PING', False),
+        ('all', 'https://example.com/', 'mailto:foo@bar.com', False),
+        ('all', 'https://example.com/', 'javascript:alert(1)', False),
+        ('all', 'https://example.com/', 'ftp://example.com/', False),
         # 'same-hostname' is exact host equality
         ('same-hostname', 'https://example.com/a', 'https://example.com/b', True),
         ('same-hostname', 'https://example.com/', 'https://www.example.com/', False),
         ('same-hostname', 'https://example.com/', 'https://other.test/', False),
+        ('same-hostname', 'https://example.com/', 'mailto:foo@example.com', False),
         # 'same-domain' allows subdomains under the same registrable domain
         ('same-domain', 'https://example.com/', 'https://www.example.com/', True),
         ('same-domain', 'https://example.com/', 'https://api.example.com/', True),
         ('same-domain', 'https://example.com/', 'https://other.test/', False),
+        ('same-domain', 'https://example.com/', 'ftp://www.example.com/', False),
         # 'same-origin' requires scheme + host + port match
         ('same-origin', 'https://example.com/', 'https://example.com/path', True),
         ('same-origin', 'https://example.com/', 'http://example.com/', False),

tests/unit/request_loaders/test_sitemap_request_loader.py

Lines changed: 22 additions & 0 deletions
@@ -398,3 +398,25 @@ async def test_sitemap_loader_strategy_all_disables_filtering(server_url: URL, h
         await loader.mark_request_as_handled(request)

     assert fetched == [cross_host_url]
+
+
+async def test_sitemap_loader_drops_non_http_scheme_under_strategy_all(
+    server_url: URL, http_client: HttpClient
+) -> None:
+    """Even with `enqueue_strategy='all'`, sitemap entries with non-http(s) schemes are dropped."""
+    http_url = 'http://other.test/page'
+    sitemap_content = _make_urlset(
+        [http_url, 'mailto:foo@bar.com', 'javascript:alert(1)', 'ftp://example.com/file.txt']
+    )
+    sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encode_base64(sitemap_content.encode()))
+
+    loader = SitemapRequestLoader([str(sitemap_url)], http_client=http_client, enqueue_strategy='all')
+
+    fetched: list[str] = []
+    while not await loader.is_finished():
+        request = await loader.fetch_next_request()
+        if request is not None:
+            fetched.append(request.url)
+            await loader.mark_request_as_handled(request)
+
+    assert fetched == [http_url]
