Skip to content

Commit 839a4bd

Browse files
committed
scheme check + enqueue strategy check
1 parent 610ae46 commit 839a4bd

4 files changed

Lines changed: 78 additions & 38 deletions

File tree

src/crawlee/_utils/robots.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from yarl import URL
88

99
from crawlee._utils.sitemap import Sitemap
10-
from crawlee._utils.urls import matches_enqueue_strategy
10+
from crawlee._utils.urls import UNSUPPORTED_SCHEME_MESSAGE, is_supported_url_scheme, matches_enqueue_strategy
1111
from crawlee._utils.web import is_status_code_client_error
1212

1313
if TYPE_CHECKING:
@@ -106,17 +106,25 @@ def get_sitemaps(self, *, enqueue_strategy: EnqueueStrategy) -> list[str]:
106106
"""
107107
sitemaps: list[str] = []
108108
for sitemap_url in self._robots.sitemaps:
109-
if matches_enqueue_strategy(
109+
if not is_supported_url_scheme(sitemap_url):
110+
logger.warning(
111+
f'Skipping sitemap {sitemap_url!r} listed in robots.txt at {str(self._original_url)!r}: '
112+
f'{UNSUPPORTED_SCHEME_MESSAGE}'
113+
)
114+
continue
115+
116+
if not matches_enqueue_strategy(
110117
strategy=enqueue_strategy,
111118
target_url=sitemap_url,
112119
origin_url=self._original_url,
113120
):
114-
sitemaps.append(sitemap_url)
115-
else:
116121
logger.warning(
117122
f'Skipping sitemap {sitemap_url!r} listed in robots.txt at {str(self._original_url)!r}: '
118123
f'does not match enqueue strategy {enqueue_strategy!r}.'
119124
)
125+
continue
126+
127+
sitemaps.append(sitemap_url)
120128
return sitemaps
121129

122130
def get_crawl_delay(self, user_agent: str = '*') -> int | None:

src/crawlee/_utils/urls.py

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,9 @@
1919
_ALLOWED_SCHEMES: frozenset[str] = frozenset({'http', 'https'})
2020
"""URL schemes Crawlee accepts for fetching and enqueuing."""
2121

22+
UNSUPPORTED_SCHEME_MESSAGE = 'unsupported URL scheme (only http and https are allowed).'
23+
"""Reusable suffix for log messages explaining why a non-`http(s)` URL was rejected."""
24+
2225
_HTTP_URL_ADAPTER: TypeAdapter[AnyHttpUrl] = TypeAdapter(AnyHttpUrl)
2326
"""Pydantic validator for HTTP and HTTPS URLs."""
2427

@@ -66,6 +69,11 @@ def validate_http_url(value: str | None) -> str | None:
6669
return value
6770

6871

72+
def is_supported_url_scheme(url: str | URL) -> bool:
    """Tell whether Crawlee can fetch `url`, i.e. whether its scheme is http or https."""
    scheme = _to_url(url).scheme
    return scheme in _ALLOWED_SCHEMES
75+
76+
6977
def matches_enqueue_strategy(
7078
strategy: EnqueueStrategy,
7179
*,
@@ -74,7 +82,9 @@ def matches_enqueue_strategy(
7482
) -> bool:
7583
"""Check whether `target_url` matches `origin_url` under the given enqueue strategy.
7684
77-
Targets with non-http(s) schemes are always rejected, including under `strategy='all'`.
85+
This function checks only the strategy relationship between the two URLs. Callers must
86+
independently reject unsupported schemes via `is_supported_url_scheme` — `matches_enqueue_strategy`
87+
does not look at the scheme.
7888
7989
Args:
8090
strategy: The enqueue strategy to apply.
@@ -84,11 +94,8 @@ def matches_enqueue_strategy(
8494
Returns:
8595
`True` if `target_url` is allowed under `strategy` relative to `origin_url`, `False` otherwise.
8696
"""
87-
target = URL(target_url) if isinstance(target_url, str) else target_url
88-
origin = URL(origin_url) if isinstance(origin_url, str) else origin_url
89-
90-
if target.scheme not in _ALLOWED_SCHEMES:
91-
return False
97+
target = _to_url(target_url)
98+
origin = _to_url(origin_url)
9299

93100
if strategy == 'all':
94101
return True
@@ -108,6 +115,10 @@ def matches_enqueue_strategy(
108115
assert_never(strategy)
109116

110117

118+
def _to_url(value: str | URL) -> URL:
    """Coerce `value` to a `URL`, leaving existing `URL` instances untouched."""
    if isinstance(value, str):
        return URL(value)
    return value
120+
121+
111122
@lru_cache(maxsize=1)
112123
def _get_tld_extractor() -> TLDExtract:
113124
"""Return a lazily-initialized `TLDExtract` instance shared across the module."""

src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 34 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,13 @@
4545
from crawlee._utils.file import atomic_write, export_csv_to_stream, export_json_to_stream
4646
from crawlee._utils.recurring_task import RecurringTask
4747
from crawlee._utils.robots import RobotsTxtFile
48-
from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute, matches_enqueue_strategy
48+
from crawlee._utils.urls import (
49+
UNSUPPORTED_SCHEME_MESSAGE,
50+
convert_to_absolute_url,
51+
is_supported_url_scheme,
52+
is_url_absolute,
53+
matches_enqueue_strategy,
54+
)
4955
from crawlee._utils.wait import wait_for
5056
from crawlee._utils.web import is_status_code_client_error, is_status_code_server_error
5157
from crawlee.errors import (
@@ -974,16 +980,23 @@ def _should_retry_request(self, context: BasicCrawlingContext, error: Exception)
974980
async def _check_url_after_redirects(self, context: TCrawlingContext) -> AsyncGenerator[TCrawlingContext, None]:
975981
"""Ensure that the `loaded_url` still matches the enqueue strategy after redirects.
976982
977-
Filter out links that redirect outside of the crawled domain.
983+
Filter out links that redirect outside of the crawled domain or to unsupported URL schemes.
978984
"""
979-
if context.request.loaded_url is not None and not matches_enqueue_strategy(
980-
strategy=context.request.enqueue_strategy,
981-
origin_url=URL(context.request.url),
982-
target_url=URL(context.request.loaded_url),
983-
):
984-
raise ContextPipelineInterruptedError(
985-
f'Skipping URL {context.request.loaded_url} (redirected from {context.request.url})'
986-
)
985+
if context.request.loaded_url is not None:
986+
if not is_supported_url_scheme(context.request.loaded_url):
987+
raise ContextPipelineInterruptedError(
988+
f'Skipping URL {context.request.loaded_url} (redirected from {context.request.url}): '
989+
f'{UNSUPPORTED_SCHEME_MESSAGE}'
990+
)
991+
992+
if not matches_enqueue_strategy(
993+
strategy=context.request.enqueue_strategy,
994+
origin_url=URL(context.request.url),
995+
target_url=URL(context.request.loaded_url),
996+
):
997+
raise ContextPipelineInterruptedError(
998+
f'Skipping URL {context.request.loaded_url} (redirected from {context.request.url})'
999+
)
9871000

9881001
yield context
9891002

@@ -1057,8 +1070,9 @@ def _enqueue_links_filter_iterator(
10571070
self.log.warning(f'Skipping enqueue: Missing hostname in origin_url = {origin_url}.')
10581071
return
10591072

1060-
# Emit a `warning` message to the log, only once per call
1061-
warning_flag = True
1073+
# Each warning is emitted at most once per call.
1074+
host_warned = False
1075+
scheme_warned = False
10621076

10631077
for request in request_iterator:
10641078
if isinstance(request, Request):
@@ -1069,9 +1083,15 @@ def _enqueue_links_filter_iterator(
10691083
target_url = request
10701084
parsed_target_url = URL(target_url)
10711085

1072-
if warning_flag and strategy != 'all' and not parsed_target_url.host:
1086+
if not is_supported_url_scheme(parsed_target_url):
1087+
if not scheme_warned:
1088+
self.log.warning(f'Skipping enqueue url {target_url!r}: {UNSUPPORTED_SCHEME_MESSAGE}')
1089+
scheme_warned = True
1090+
continue
1091+
1092+
if not host_warned and strategy != 'all' and not parsed_target_url.host:
10731093
self.log.warning(f'Skipping enqueue url: Missing hostname in target_url = {target_url}.')
1074-
warning_flag = False
1094+
host_warned = True
10751095

10761096
if matches_enqueue_strategy(
10771097
strategy=strategy,

src/crawlee/request_loaders/_sitemap_request_loader.py

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
from crawlee._utils.globs import Glob
1616
from crawlee._utils.recoverable_state import RecoverableState
1717
from crawlee._utils.sitemap import NestedSitemap, ParseSitemapOptions, SitemapSource, SitemapUrl, parse_sitemap
18-
from crawlee._utils.urls import matches_enqueue_strategy
18+
from crawlee._utils.urls import UNSUPPORTED_SCHEME_MESSAGE, is_supported_url_scheme, matches_enqueue_strategy
1919
from crawlee.request_loaders._request_loader import RequestLoader
2020

2121
if TYPE_CHECKING:
@@ -200,17 +200,20 @@ async def _get_state(self) -> SitemapRequestLoaderState:
200200

201201
return self._state.current_value
202202

def _passes_filters(self, target: str, parent: URL, parent_url: str, kind: str) -> bool:
    """Apply the URL-scheme check, then the enqueue-strategy check, to `target`.

    Logs a warning describing the failed check and returns `False` at the first
    rejection; returns `True` only when both checks pass. `kind` names what is
    being filtered (e.g. 'nested sitemap', 'sitemap URL') for the log message.
    """
    # Scheme gate first: matches_enqueue_strategy no longer inspects schemes.
    scheme_ok = is_supported_url_scheme(target)
    if not scheme_ok:
        logger.warning(f'Skipping {kind} {target!r}: {UNSUPPORTED_SCHEME_MESSAGE}')
        return False

    strategy_ok = matches_enqueue_strategy(
        strategy=self._enqueue_strategy, target_url=target, origin_url=parent
    )
    if strategy_ok:
        return True

    logger.warning(
        f'Skipping {kind} {target!r}: does not match enqueue strategy '
        f'{self._enqueue_strategy!r} relative to {parent_url!r}.'
    )
    return False
214217

215218
def _check_url_patterns(
216219
self,
@@ -268,9 +271,7 @@ async def _load_sitemaps(self) -> None:
268271
if isinstance(item, NestedSitemap):
269272
# Add nested sitemap to queue
270273
if item.loc not in state.pending_sitemap_urls and item.loc not in state.processed_sitemap_urls:
271-
if not self._passes_strategy_filter(
272-
item.loc, parsed_sitemap_url, sitemap_url, 'nested sitemap'
273-
):
274+
if not self._passes_filters(item.loc, parsed_sitemap_url, sitemap_url, 'nested sitemap'):
274275
continue
275276
state.pending_sitemap_urls.append(item.loc)
276277
continue
@@ -288,7 +289,7 @@ async def _load_sitemaps(self) -> None:
288289
if not self._check_url_patterns(url, self._include, self._exclude):
289290
continue
290291

291-
if not self._passes_strategy_filter(url, parsed_sitemap_url, sitemap_url, 'sitemap URL'):
292+
if not self._passes_filters(url, parsed_sitemap_url, sitemap_url, 'sitemap URL'):
292293
continue
293294

294295
# Check if we have capacity in the queue

0 commit comments

Comments (0)