Skip to content

Commit 4ba08d1

Browse files
vdusek and claude
committed
simplify filter_url helpers and callers
- pass already-parsed URL through to _matches_enqueue_strategy (drops redundant _to_url calls per URL on the hot path) - drop redundant parent_url param from _passes_filters; use str(parent) - drop dead `strategy != 'all'` check in enqueue iterator (only reachable on non-'all' strategies after scheme passes) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 4de1543 commit 4ba08d1

3 files changed

Lines changed: 23 additions & 37 deletions

File tree

src/crawlee/_utils/urls.py

Lines changed: 15 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -92,10 +92,13 @@ def filter_url(
9292
a human-readable rejection message suitable for log output.
9393
"""
9494
target_url = _to_url(target)
95+
9596
if not _is_supported_url_scheme(target_url):
9697
return False, UNSUPPORTED_SCHEME_MESSAGE
97-
if not _matches_enqueue_strategy(strategy, target_url=target_url, origin_url=origin):
98+
99+
if not _matches_enqueue_strategy(strategy, target_url=target_url, origin_url=_to_url(origin)):
98100
return False, f'does not match enqueue strategy {strategy!r}'
101+
99102
return True, None
100103

101104

@@ -107,39 +110,28 @@ def _is_supported_url_scheme(url: str | URL) -> bool:
107110
def _matches_enqueue_strategy(
108111
strategy: EnqueueStrategy,
109112
*,
110-
target_url: str | URL,
111-
origin_url: str | URL,
113+
target_url: URL,
114+
origin_url: URL,
112115
) -> bool:
113-
"""Check whether `target_url` matches `origin_url` under the given enqueue strategy.
114-
115-
Pure strategy comparison — does not look at the scheme. Callers should go through `filter_url`,
116-
which combines this with a scheme check.
117-
118-
Args:
119-
strategy: The enqueue strategy to apply.
120-
target_url: The URL to be evaluated.
121-
origin_url: The reference URL the target is compared against.
122-
123-
Returns:
124-
`True` if `target_url` is allowed under `strategy` relative to `origin_url`, `False` otherwise.
125-
"""
126-
target = _to_url(target_url)
127-
origin = _to_url(origin_url)
128-
116+
"""Check whether `target_url` matches `origin_url` under `strategy`. Scheme is not considered."""
129117
if strategy == 'all':
130118
return True
131119

132-
if origin.host is None or target.host is None:
120+
if origin_url.host is None or target_url.host is None:
133121
return False
134122

135123
if strategy == 'same-hostname':
136-
return target.host == origin.host
124+
return target_url.host == origin_url.host
137125

138126
if strategy == 'same-domain':
139-
return _domain_under_public_suffix(origin.host) == _domain_under_public_suffix(target.host)
127+
return _domain_under_public_suffix(origin_url.host) == _domain_under_public_suffix(target_url.host)
140128

141129
if strategy == 'same-origin':
142-
return target.host == origin.host and target.scheme == origin.scheme and target.port == origin.port
130+
return (
131+
target_url.host == origin_url.host
132+
and target_url.scheme == origin_url.scheme
133+
and target_url.port == origin_url.port
134+
)
143135

144136
assert_never(strategy)
145137

src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -45,12 +45,7 @@
4545
from crawlee._utils.file import atomic_write, export_csv_to_stream, export_json_to_stream
4646
from crawlee._utils.recurring_task import RecurringTask
4747
from crawlee._utils.robots import RobotsTxtFile
48-
from crawlee._utils.urls import (
49-
UNSUPPORTED_SCHEME_MESSAGE,
50-
convert_to_absolute_url,
51-
filter_url,
52-
is_url_absolute,
53-
)
48+
from crawlee._utils.urls import UNSUPPORTED_SCHEME_MESSAGE, convert_to_absolute_url, filter_url, is_url_absolute
5449
from crawlee._utils.wait import wait_for
5550
from crawlee._utils.web import is_status_code_client_error, is_status_code_server_error
5651
from crawlee.errors import (
@@ -1079,14 +1074,13 @@ def _enqueue_links_filter_iterator(
10791074

10801075
ok, reason = filter_url(target=parsed_target_url, strategy=strategy, origin=parsed_origin_url)
10811076
if not ok:
1082-
# Strategy mismatches are expected (most extracted links are external) so they're
1083-
# silent. Scheme rejections and missing hostnames signal a misconfiguration upstream,
1084-
# so we warn — but only once per call to avoid spamming when a page yields many.
1077+
# Strategy mismatches are expected (most extracted links are external) so stay silent.
1078+
# Scheme rejections and missing hostnames signal a misconfiguration upstream, so warn.
10851079
if reason == UNSUPPORTED_SCHEME_MESSAGE:
10861080
if not scheme_warned:
10871081
self.log.warning(f'Skipping enqueue url {target_url!r}: {reason}')
10881082
scheme_warned = True
1089-
elif strategy != 'all' and not parsed_target_url.host and not host_warned:
1083+
elif not parsed_target_url.host and not host_warned:
10901084
self.log.warning(f'Skipping enqueue url: Missing hostname in target_url = {target_url}.')
10911085
host_warned = True
10921086
continue

src/crawlee/request_loaders/_sitemap_request_loader.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -200,11 +200,11 @@ async def _get_state(self) -> SitemapRequestLoaderState:
200200

201201
return self._state.current_value
202202

203-
def _passes_filters(self, target: str, parent: URL, parent_url: str, kind: str) -> bool:
203+
def _passes_filters(self, target: str, parent: URL, kind: str) -> bool:
204204
"""Filter `target` by URL scheme and enqueue strategy, logging the reason if rejected."""
205205
ok, reason = filter_url(target=target, strategy=self._enqueue_strategy, origin=parent)
206206
if not ok:
207-
logger.warning(f'Skipping {kind} {target!r} (parent {parent_url!r}): {reason}.')
207+
logger.warning(f'Skipping {kind} {target!r} (parent {str(parent)!r}): {reason}.')
208208
return False
209209
return True
210210

@@ -264,7 +264,7 @@ async def _load_sitemaps(self) -> None:
264264
if isinstance(item, NestedSitemap):
265265
# Add nested sitemap to queue
266266
if item.loc not in state.pending_sitemap_urls and item.loc not in state.processed_sitemap_urls:
267-
if not self._passes_filters(item.loc, parsed_sitemap_url, sitemap_url, 'nested sitemap'):
267+
if not self._passes_filters(item.loc, parsed_sitemap_url, 'nested sitemap'):
268268
continue
269269
state.pending_sitemap_urls.append(item.loc)
270270
continue
@@ -282,7 +282,7 @@ async def _load_sitemaps(self) -> None:
282282
if not self._check_url_patterns(url, self._include, self._exclude):
283283
continue
284284

285-
if not self._passes_filters(url, parsed_sitemap_url, sitemap_url, 'sitemap URL'):
285+
if not self._passes_filters(url, parsed_sitemap_url, 'sitemap URL'):
286286
continue
287287

288288
# Check if we have capacity in the queue

0 commit comments

Comments (0)