4545from crawlee ._utils .file import atomic_write , export_csv_to_stream , export_json_to_stream
4646from crawlee ._utils .recurring_task import RecurringTask
4747from crawlee ._utils .robots import RobotsTxtFile
48- from crawlee ._utils .urls import convert_to_absolute_url , is_url_absolute , matches_enqueue_strategy
48+ from crawlee ._utils .urls import (
49+ UNSUPPORTED_SCHEME_MESSAGE ,
50+ convert_to_absolute_url ,
51+ is_supported_url_scheme ,
52+ is_url_absolute ,
53+ matches_enqueue_strategy ,
54+ )
4955from crawlee ._utils .wait import wait_for
5056from crawlee ._utils .web import is_status_code_client_error , is_status_code_server_error
5157from crawlee .errors import (
@@ -974,16 +980,23 @@ def _should_retry_request(self, context: BasicCrawlingContext, error: Exception)
974980 async def _check_url_after_redirects (self , context : TCrawlingContext ) -> AsyncGenerator [TCrawlingContext , None ]:
975981 """Ensure that the `loaded_url` still matches the enqueue strategy after redirects.
976982
977- Filter out links that redirect outside of the crawled domain.
983+ Filter out links that redirect outside of the crawled domain or to unsupported URL schemes.
978984 """
979- if context .request .loaded_url is not None and not matches_enqueue_strategy (
980- strategy = context .request .enqueue_strategy ,
981- origin_url = URL (context .request .url ),
982- target_url = URL (context .request .loaded_url ),
983- ):
984- raise ContextPipelineInterruptedError (
985- f'Skipping URL { context .request .loaded_url } (redirected from { context .request .url } )'
986- )
985+ if context .request .loaded_url is not None :
986+ if not is_supported_url_scheme (context .request .loaded_url ):
987+ raise ContextPipelineInterruptedError (
988+ f'Skipping URL { context .request .loaded_url } (redirected from { context .request .url } ): '
989+ f'{ UNSUPPORTED_SCHEME_MESSAGE } '
990+ )
991+
992+ if not matches_enqueue_strategy (
993+ strategy = context .request .enqueue_strategy ,
994+ origin_url = URL (context .request .url ),
995+ target_url = URL (context .request .loaded_url ),
996+ ):
997+ raise ContextPipelineInterruptedError (
998+ f'Skipping URL { context .request .loaded_url } (redirected from { context .request .url } )'
999+ )
9871000
9881001 yield context
9891002
@@ -1057,8 +1070,9 @@ def _enqueue_links_filter_iterator(
10571070 self .log .warning (f'Skipping enqueue: Missing hostname in origin_url = { origin_url } .' )
10581071 return
10591072
1060- # Emit a `warning` message to the log, only once per call
1061- warning_flag = True
1073+ # Each warning is emitted at most once per call.
1074+ host_warned = False
1075+ scheme_warned = False
10621076
10631077 for request in request_iterator :
10641078 if isinstance (request , Request ):
@@ -1069,9 +1083,15 @@ def _enqueue_links_filter_iterator(
10691083 target_url = request
10701084 parsed_target_url = URL (target_url )
10711085
1072- if warning_flag and strategy != 'all' and not parsed_target_url .host :
1086+ if not is_supported_url_scheme (parsed_target_url ):
1087+ if not scheme_warned :
1088+ self .log .warning (f'Skipping enqueue url { target_url !r} : { UNSUPPORTED_SCHEME_MESSAGE } ' )
1089+ scheme_warned = True
1090+ continue
1091+
1092+ if not host_warned and strategy != 'all' and not parsed_target_url .host :
10731093 self .log .warning (f'Skipping enqueue url: Missing hostname in target_url = { target_url } .' )
1074- warning_flag = False
1094+ host_warned = True
10751095
10761096 if matches_enqueue_strategy (
10771097 strategy = strategy ,
0 commit comments