 if TYPE_CHECKING:
     from typing_extensions import Self
 
+    from crawlee._types import EnqueueStrategy
     from crawlee.http_clients import HttpClient
     from crawlee.proxy_configuration import ProxyInfo
 
 
 class RobotsTxtFile:
     def __init__(
-        self, url: str, robots: Protego, http_client: HttpClient | None = None, proxy_info: ProxyInfo | None = None
+        self,
+        url: str,
+        robots: Protego,
+        http_client: HttpClient | None = None,
+        proxy_info: ProxyInfo | None = None,
     ) -> None:
         self._robots = robots
         self._original_url = URL(url).origin()
@@ -90,18 +95,29 @@ def is_allowed(self, url: str, user_agent: str = '*') -> bool:
             return True
         return bool(self._robots.can_fetch(str(check_url), user_agent))
 
-    def get_sitemaps(self) -> list[str]:
-        """Get the list of same-host sitemap URLs from the robots.txt file."""
-        same_host_sitemaps: list[str] = []
+    def get_sitemaps(self, *, enqueue_strategy: EnqueueStrategy) -> list[str]:
+        """Get the list of sitemap URLs from the robots.txt file, filtered by enqueue strategy.
+
+        Args:
+            enqueue_strategy: Strategy used to filter sitemap entries relative to the robots.txt URL's host.
+                Pass `'same-hostname'` to match the sitemap protocol's same-host expectation, or `'all'` to
+                disable host filtering. Regardless of the strategy, entries with non-`http(s)` schemes are
+                always filtered out.
+        """
+        sitemaps: list[str] = []
         for sitemap_url in self._robots.sitemaps:
-            if matches_enqueue_strategy('same-hostname', target_url=sitemap_url, origin_url=self._original_url):
-                same_host_sitemaps.append(sitemap_url)
+            if matches_enqueue_strategy(
+                strategy=enqueue_strategy,
+                target_url=sitemap_url,
+                origin_url=self._original_url,
+            ):
+                sitemaps.append(sitemap_url)
             else:
                 logger.warning(
                     f'Skipping sitemap {sitemap_url!r} listed in robots.txt at {str(self._original_url)!r}: '
-                    f'cross-host sitemap entries are not allowed by the robots.txt specification.'
+                    f'does not match enqueue strategy {enqueue_strategy!r}.'
                 )
-        return same_host_sitemaps
+        return sitemaps
 
     def get_crawl_delay(self, user_agent: str = '*') -> int | None:
         """Get the crawl delay for the given user agent.
@@ -113,15 +129,23 @@ def get_crawl_delay(self, user_agent: str = '*') -> int | None:
         crawl_delay = self._robots.crawl_delay(user_agent)
         return int(crawl_delay) if crawl_delay is not None else None
 
-    async def parse_sitemaps(self) -> Sitemap:
-        """Parse the sitemaps from the robots.txt file and return a `Sitemap` instance."""
-        sitemaps = self.get_sitemaps()
+    async def parse_sitemaps(self, *, enqueue_strategy: EnqueueStrategy) -> Sitemap:
+        """Parse the sitemaps from the robots.txt file and return a `Sitemap` instance.
+
+        Args:
+            enqueue_strategy: Forwarded to `get_sitemaps`; see that method for details.
+        """
+        sitemaps = self.get_sitemaps(enqueue_strategy=enqueue_strategy)
         if not self._http_client:
             raise ValueError('HTTP client is required to parse sitemaps.')
 
         return await Sitemap.load(sitemaps, self._http_client, self._proxy_info)
 
-    async def parse_urls_from_sitemaps(self) -> list[str]:
-        """Parse the sitemaps in the robots.txt file and return a list URLs."""
-        sitemap = await self.parse_sitemaps()
+    async def parse_urls_from_sitemaps(self, *, enqueue_strategy: EnqueueStrategy) -> list[str]:
+        """Parse the sitemaps in the robots.txt file and return a list of URLs.
+
+        Args:
+            enqueue_strategy: Forwarded to `get_sitemaps`; see that method for details.
+        """
+        sitemap = await self.parse_sitemaps(enqueue_strategy=enqueue_strategy)
         return sitemap.urls
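
For context, here is a minimal usage sketch of the new keyword-only `enqueue_strategy` parameter (not part of this diff). It assumes the constructor shown above, the `Protego.parse` API from the `protego` package, and the `'same-hostname'` / `'all'` strategy values mentioned in the docstring; the actual crawlee entry points for obtaining a `RobotsTxtFile` may differ.

```python
from protego import Protego

# Hypothetical robots.txt body with one same-host and one cross-host sitemap entry.
robots_body = """
User-agent: *
Disallow: /private/
Sitemap: https://example.com/sitemap.xml
Sitemap: https://cdn.example.net/sitemap.xml
"""

robots_file = RobotsTxtFile('https://example.com/robots.txt', Protego.parse(robots_body))

# 'same-hostname' keeps only https://example.com/sitemap.xml and logs a warning for the
# cdn.example.net entry; 'all' returns both entries.
same_host = robots_file.get_sitemaps(enqueue_strategy='same-hostname')
everything = robots_file.get_sitemaps(enqueue_strategy='all')

# is_allowed() needs no HTTP client; parse_sitemaps() would raise ValueError here
# because no HttpClient was passed to the constructor.
assert not robots_file.is_allowed('https://example.com/private/page')
```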