Skip to content

Commit 839a4bd

Browse files
committed
scheme check + enqueue strategy check
1 parent 610ae46 commit 839a4bd

4 files changed

Lines changed: 78 additions & 38 deletions

File tree

src/crawlee/_utils/robots.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from yarl import URL
88

99
from crawlee._utils.sitemap import Sitemap
10-
from crawlee._utils.urls import matches_enqueue_strategy
10+
from crawlee._utils.urls import UNSUPPORTED_SCHEME_MESSAGE, is_supported_url_scheme, matches_enqueue_strategy
1111
from crawlee._utils.web import is_status_code_client_error
1212

1313
if TYPE_CHECKING:
@@ -106,17 +106,25 @@ def get_sitemaps(self, *, enqueue_strategy: EnqueueStrategy) -> list[str]:
106106
"""
107107
sitemaps: list[str] = []
108108
for sitemap_url in self._robots.sitemaps:
109-
if matches_enqueue_strategy(
109+
if not is_supported_url_scheme(sitemap_url):
110+
logger.warning(
111+
f'Skipping sitemap {sitemap_url!r} listed in robots.txt at {str(self._original_url)!r}: '
112+
f'{UNSUPPORTED_SCHEME_MESSAGE}'
113+
)
114+
continue
115+
116+
if not matches_enqueue_strategy(
110117
strategy=enqueue_strategy,
111118
target_url=sitemap_url,
112119
origin_url=self._original_url,
113120
):
114-
sitemaps.append(sitemap_url)
115-
else:
116121
logger.warning(
117122
f'Skipping sitemap {sitemap_url!r} listed in robots.txt at {str(self._original_url)!r}: '
118123
f'does not match enqueue strategy {enqueue_strategy!r}.'
119124
)
125+
continue
126+
127+
sitemaps.append(sitemap_url)
120128
return sitemaps
121129

122130
def get_crawl_delay(self, user_agent: str = '*') -> int | None:

src/crawlee/_utils/urls.py

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,9 @@
1919
_ALLOWED_SCHEMES: frozenset[str] = frozenset({'http', 'https'})
2020
"""URL schemes Crawlee accepts for fetching and enqueuing."""
2121

22+
UNSUPPORTED_SCHEME_MESSAGE = 'unsupported URL scheme (only http and https are allowed).'
23+
"""Reusable suffix for log messages explaining why a non-`http(s)` URL was rejected."""
24+
2225
_HTTP_URL_ADAPTER: TypeAdapter[AnyHttpUrl] = TypeAdapter(AnyHttpUrl)
2326
"""Pydantic validator for HTTP and HTTPS URLs."""
2427

@@ -66,6 +69,11 @@ def validate_http_url(value: str | None) -> str | None:
6669
return value
6770

6871

72+
def is_supported_url_scheme(url: str | URL) -> bool:
    """Tell whether Crawlee can fetch `url`, i.e. whether its scheme is http or https."""
    scheme = _to_url(url).scheme
    return scheme in _ALLOWED_SCHEMES
75+
76+
6977
def matches_enqueue_strategy(
7078
strategy: EnqueueStrategy,
7179
*,
@@ -74,7 +82,9 @@ def matches_enqueue_strategy(
7482
) -> bool:
7583
"""Check whether `target_url` matches `origin_url` under the given enqueue strategy.
7684
77-
Targets with non-http(s) schemes are always rejected, including under `strategy='all'`.
85+
This function checks only the strategy relationship between the two URLs. Callers must
86+
independently reject unsupported schemes via `is_supported_url_scheme` — `matches_enqueue_strategy`
87+
does not look at the scheme.
7888
7989
Args:
8090
strategy: The enqueue strategy to apply.
@@ -84,11 +94,8 @@ def matches_enqueue_strategy(
8494
Returns:
8595
`True` if `target_url` is allowed under `strategy` relative to `origin_url`, `False` otherwise.
8696
"""
87-
target = URL(target_url) if isinstance(target_url, str) else target_url
88-
origin = URL(origin_url) if isinstance(origin_url, str) else origin_url
89-
90-
if target.scheme not in _ALLOWED_SCHEMES:
91-
return False
97+
target = _to_url(target_url)
98+
origin = _to_url(origin_url)
9299

93100
if strategy == 'all':
94101
return True
@@ -108,6 +115,10 @@ def matches_enqueue_strategy(
108115
assert_never(strategy)
109116

110117

118+
def _to_url(value: str | URL) -> URL:
    """Coerce `value` to a `URL`, leaving existing `URL` instances untouched."""
    if isinstance(value, str):
        return URL(value)
    return value
120+
121+
111122
@lru_cache(maxsize=1)
112123
def _get_tld_extractor() -> TLDExtract:
113124
"""Return a lazily-initialized `TLDExtract` instance shared across the module."""

src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 34 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,13 @@
4545
from crawlee._utils.file import atomic_write, export_csv_to_stream, export_json_to_stream
4646
from crawlee._utils.recurring_task import RecurringTask
4747
from crawlee._utils.robots import RobotsTxtFile
48-
from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute, matches_enqueue_strategy
48+
from crawlee._utils.urls import (
49+
UNSUPPORTED_SCHEME_MESSAGE,
50+
convert_to_absolute_url,
51+
is_supported_url_scheme,
52+
is_url_absolute,
53+
matches_enqueue_strategy,
54+
)
4955
from crawlee._utils.wait import wait_for
5056
from crawlee._utils.web import is_status_code_client_error, is_status_code_server_error
5157
from crawlee.errors import (
@@ -974,16 +980,23 @@ def _should_retry_request(self, context: BasicCrawlingContext, error: Exception)
974980
async def _check_url_after_redirects(self, context: TCrawlingContext) -> AsyncGenerator[TCrawlingContext, None]:
975981
"""Ensure that the `loaded_url` still matches the enqueue strategy after redirects.
976982
977-
Filter out links that redirect outside of the crawled domain.
983+
Filter out links that redirect outside of the crawled domain or to unsupported URL schemes.
978984
"""
979-
if context.request.loaded_url is not None and not matches_enqueue_strategy(
980-
strategy=context.request.enqueue_strategy,
981-
origin_url=URL(context.request.url),
982-
target_url=URL(context.request.loaded_url),
983-
):
984-
raise ContextPipelineInterruptedError(
985-
f'Skipping URL {context.request.loaded_url} (redirected from {context.request.url})'
986-
)
985+
if context.request.loaded_url is not None:
986+
if not is_supported_url_scheme(context.request.loaded_url):
987+
raise ContextPipelineInterruptedError(
988+
f'Skipping URL {context.request.loaded_url} (redirected from {context.request.url}): '
989+
f'{UNSUPPORTED_SCHEME_MESSAGE}'
990+
)
991+
992+
if not matches_enqueue_strategy(
993+
strategy=context.request.enqueue_strategy,
994+
origin_url=URL(context.request.url),
995+
target_url=URL(context.request.loaded_url),
996+
):
997+
raise ContextPipelineInterruptedError(
998+
f'Skipping URL {context.request.loaded_url} (redirected from {context.request.url})'
999+
)
9871000

9881001
yield context
9891002

@@ -1057,8 +1070,9 @@ def _enqueue_links_filter_iterator(
10571070
self.log.warning(f'Skipping enqueue: Missing hostname in origin_url = {origin_url}.')
10581071
return
10591072

1060-
# Emit a `warning` message to the log, only once per call
1061-
warning_flag = True
1073+
# Each warning is emitted at most once per call.
1074+
host_warned = False
1075+
scheme_warned = False
10621076

10631077
for request in request_iterator:
10641078
if isinstance(request, Request):
@@ -1069,9 +1083,15 @@ def _enqueue_links_filter_iterator(
10691083
target_url = request
10701084
parsed_target_url = URL(target_url)
10711085

1072-
if warning_flag and strategy != 'all' and not parsed_target_url.host:
1086+
if not is_supported_url_scheme(parsed_target_url):
1087+
if not scheme_warned:
1088+
self.log.warning(f'Skipping enqueue url {target_url!r}: {UNSUPPORTED_SCHEME_MESSAGE}')
1089+
scheme_warned = True
1090+
continue
1091+
1092+
if not host_warned and strategy != 'all' and not parsed_target_url.host:
10731093
self.log.warning(f'Skipping enqueue url: Missing hostname in target_url = {target_url}.')
1074-
warning_flag = False
1094+
host_warned = True
10751095

10761096
if matches_enqueue_strategy(
10771097
strategy=strategy,

src/crawlee/request_loaders/_sitemap_request_loader.py

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
from crawlee._utils.globs import Glob
1616
from crawlee._utils.recoverable_state import RecoverableState
1717
from crawlee._utils.sitemap import NestedSitemap, ParseSitemapOptions, SitemapSource, SitemapUrl, parse_sitemap
18-
from crawlee._utils.urls import matches_enqueue_strategy
18+
from crawlee._utils.urls import UNSUPPORTED_SCHEME_MESSAGE, is_supported_url_scheme, matches_enqueue_strategy
1919
from crawlee.request_loaders._request_loader import RequestLoader
2020

2121
if TYPE_CHECKING:
@@ -200,17 +200,20 @@ async def _get_state(self) -> SitemapRequestLoaderState:
200200

201201
return self._state.current_value
202202

def _passes_filters(self, target: str, parent: URL, parent_url: str, kind: str) -> bool:
    """Apply the URL-scheme check, then the enqueue-strategy check, to `target`.

    Logs a warning describing the failed check and returns `False` at the first
    rejection; returns `True` only when both checks pass. `kind` names what is
    being filtered (e.g. 'nested sitemap', 'sitemap URL') for the log message.
    """
    # Scheme gate first: matches_enqueue_strategy no longer inspects schemes.
    scheme_ok = is_supported_url_scheme(target)
    if not scheme_ok:
        logger.warning(f'Skipping {kind} {target!r}: {UNSUPPORTED_SCHEME_MESSAGE}')
        return False

    strategy_ok = matches_enqueue_strategy(
        strategy=self._enqueue_strategy, target_url=target, origin_url=parent
    )
    if strategy_ok:
        return True

    logger.warning(
        f'Skipping {kind} {target!r}: does not match enqueue strategy '
        f'{self._enqueue_strategy!r} relative to {parent_url!r}.'
    )
    return False
214217

215218
def _check_url_patterns(
216219
self,
@@ -268,9 +271,7 @@ async def _load_sitemaps(self) -> None:
268271
if isinstance(item, NestedSitemap):
269272
# Add nested sitemap to queue
270273
if item.loc not in state.pending_sitemap_urls and item.loc not in state.processed_sitemap_urls:
271-
if not self._passes_strategy_filter(
272-
item.loc, parsed_sitemap_url, sitemap_url, 'nested sitemap'
273-
):
274+
if not self._passes_filters(item.loc, parsed_sitemap_url, sitemap_url, 'nested sitemap'):
274275
continue
275276
state.pending_sitemap_urls.append(item.loc)
276277
continue
@@ -288,7 +289,7 @@ async def _load_sitemaps(self) -> None:
288289
if not self._check_url_patterns(url, self._include, self._exclude):
289290
continue
290291

291-
if not self._passes_strategy_filter(url, parsed_sitemap_url, sitemap_url, 'sitemap URL'):
292+
if not self._passes_filters(url, parsed_sitemap_url, sitemap_url, 'sitemap URL'):
292293
continue
293294

294295
# Check if we have capacity in the queue

0 commit comments

Comments (0)