From e5aff8630cbf64d259c6a6dbecea030dededdf3f Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Tue, 16 Dec 2025 20:05:43 +0000 Subject: [PATCH 01/18] add docs "HttpCrawler with custom parser" --- .../httpcrawler_custom_parser/lxml_parser.py | 61 +++++++++++++++ .../lxml_saxonche_parser.py | 77 +++++++++++++++++++ .../pyquery_parser.py | 64 +++++++++++++++ .../scrapling_parser.py | 74 ++++++++++++++++++ .../selectolax_parser.py | 63 +++++++++++++++ docs/guides/httpcrawler_custom_parser.mdx | 67 ++++++++++++++++ pyproject.toml | 5 ++ 7 files changed, 411 insertions(+) create mode 100644 docs/guides/code_examples/httpcrawler_custom_parser/lxml_parser.py create mode 100644 docs/guides/code_examples/httpcrawler_custom_parser/lxml_saxonche_parser.py create mode 100644 docs/guides/code_examples/httpcrawler_custom_parser/pyquery_parser.py create mode 100644 docs/guides/code_examples/httpcrawler_custom_parser/scrapling_parser.py create mode 100644 docs/guides/code_examples/httpcrawler_custom_parser/selectolax_parser.py create mode 100644 docs/guides/httpcrawler_custom_parser.mdx diff --git a/docs/guides/code_examples/httpcrawler_custom_parser/lxml_parser.py b/docs/guides/code_examples/httpcrawler_custom_parser/lxml_parser.py new file mode 100644 index 0000000000..b50fda4293 --- /dev/null +++ b/docs/guides/code_examples/httpcrawler_custom_parser/lxml_parser.py @@ -0,0 +1,61 @@ +import asyncio + +from lxml import html +from pydantic import ValidationError + +from crawlee import Request +from crawlee.crawlers import HttpCrawler, HttpCrawlingContext + + +async def main() -> None: + crawler = HttpCrawler( + max_request_retries=1, + max_requests_per_crawl=10, + ) + + @crawler.router.default_handler + async def request_handler(context: HttpCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Parse the HTML content using lxml. + parsed_html = html.fromstring(await context.http_response.read()) + + # Extract data from the page. + data = { + 'url': context.request.url, + 'title': parsed_html.findtext('.//title'), + 'h1s': [h1.text_content() for h1 in parsed_html.findall('.//h1')], + 'h2s': [h2.text_content() for h2 in parsed_html.findall('.//h2')], + 'h3s': [h3.text_content() for h3 in parsed_html.findall('.//h3')], + } + await context.push_data(data) + + # Convert relative URLs to absolute before extracting links. + parsed_html.make_links_absolute(context.request.url, resolve_base_href=True) + + # Xpath 1.0 selector for extracting valid href attributes. + links_xpath = ( + '//a/@href[not(starts-with(., "#")) ' + 'and not(starts-with(., "javascript:")) ' + 'and not(starts-with(., "mailto:"))]' + ) + + extracted_requests = [] + + # Extract links. + for url in parsed_html.xpath(links_xpath): + try: + request = Request.from_url(url) + except ValidationError as exc: + context.log.warning(f'Skipping invalid URL "{url}": {exc}') + continue + extracted_requests.append(request) + + # Add extracted requests to the queue with the same-domain strategy. 
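+        # 'same-domain' keeps the crawl on the start URL's domain; the other
+        # accepted strategies are 'all', 'same-hostname' and 'same-origin'.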
+ await context.add_requests(extracted_requests, strategy='same-domain') + + await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/guides/code_examples/httpcrawler_custom_parser/lxml_saxonche_parser.py b/docs/guides/code_examples/httpcrawler_custom_parser/lxml_saxonche_parser.py new file mode 100644 index 0000000000..ac839a6164 --- /dev/null +++ b/docs/guides/code_examples/httpcrawler_custom_parser/lxml_saxonche_parser.py @@ -0,0 +1,77 @@ +import asyncio + +from lxml import html +from pydantic import ValidationError +from saxonche import PySaxonProcessor + +from crawlee import Request +from crawlee.crawlers import HttpCrawler, HttpCrawlingContext + + +async def main() -> None: + crawler = HttpCrawler( + max_request_retries=1, + max_requests_per_crawl=10, + ) + + # Create Saxon processor once and reuse across requests. + saxon_proc = PySaxonProcessor(license=False) + xpath_proc = saxon_proc.new_xpath_processor() + + @crawler.router.default_handler + async def request_handler(context: HttpCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Parse HTML with lxml. + parsed_html = html.fromstring(await context.http_response.read()) + # Convert relative URLs to absolute before extracting links. + parsed_html.make_links_absolute(context.request.url, resolve_base_href=True) + # Convert parsed HTML to XML for Saxon processing. + xml = html.tostring(parsed_html, encoding='unicode', method='xml') + # Parse XML with Saxon. + parsed_xml = saxon_proc.parse_xml(xml_text=xml) + # Set the parsed context for XPath evaluation. + xpath_proc.set_context(xdm_item=parsed_xml) + + # Extract data using XPath 2.0 string() function. + data = { + 'url': context.request.url, + 'title': xpath_proc.evaluate_single('.//title/string()'), + 'h1s': [str(h) for h in (xpath_proc.evaluate('//h1/string()') or [])], + 'h2s': [str(h) for h in (xpath_proc.evaluate('//h2/string()') or [])], + 'h3s': [str(h) for h in (xpath_proc.evaluate('//h3/string()') or [])], + } + await context.push_data(data) + + # XPath 2.0 with distinct-values() to get unique links and remove fragments. + links_xpath = """ + distinct-values( + for $href in //a/@href[ + not(starts-with(., "#")) + and not(starts-with(., "javascript:")) + and not(starts-with(., "mailto:")) + ] + return replace($href, "#.*$", "") + ) + """ + + extracted_requests = [] + + # Extract links. + for item in xpath_proc.evaluate(links_xpath) or []: + url = item.string_value + try: + request = Request.from_url(url) + except ValidationError as exc: + context.log.warning(f'Skipping invalid URL "{url}": {exc}') + continue + extracted_requests.append(request) + + # Add extracted requests to the queue with the same-domain strategy. 
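+        # distinct-values() already removed duplicates within this page; the
+        # request queue additionally skips URLs seen on earlier pages.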
+ await context.add_requests(extracted_requests, strategy='same-domain') + + await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/guides/code_examples/httpcrawler_custom_parser/pyquery_parser.py b/docs/guides/code_examples/httpcrawler_custom_parser/pyquery_parser.py new file mode 100644 index 0000000000..1e15e9cb5b --- /dev/null +++ b/docs/guides/code_examples/httpcrawler_custom_parser/pyquery_parser.py @@ -0,0 +1,64 @@ +import asyncio + +from pydantic import ValidationError +from pyquery import PyQuery +from yarl import URL + +from crawlee import Request +from crawlee.crawlers import HttpCrawler, HttpCrawlingContext + + +async def main() -> None: + crawler = HttpCrawler( + max_request_retries=1, + max_requests_per_crawl=10, + ) + + @crawler.router.default_handler + async def request_handler(context: HttpCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Parse the HTML content using PyQuery. + parsed_html = PyQuery(await context.http_response.read()) + + # Extract data using jQuery-style selectors. + data = { + 'url': context.request.url, + 'title': parsed_html('title').text(), + 'h1s': [h1.text() for h1 in parsed_html('h1').items()], + 'h2s': [h2.text() for h2 in parsed_html('h2').items()], + 'h3s': [h3.text() for h3 in parsed_html('h3').items()], + } + await context.push_data(data) + + # Css selector to extract valid href attributes. + links_selector = ( + 'a[href]:not([href^="#"]):not([href^="javascript:"]):not([href^="mailto:"])' + ) + base_url = URL(context.request.url) + + extracted_requests = [] + + # Extract links. + for item in parsed_html(links_selector).items(): + href = item.attr('href') + if not href: + continue + + # Convert relative URLs to absolute if needed. + url = str(base_url.join(URL(str(href)))) + try: + request = Request.from_url(url) + except ValidationError as exc: + context.log.warning(f'Skipping invalid URL "{url}": {exc}') + continue + extracted_requests.append(request) + + # Add extracted requests to the queue with the same-domain strategy. + await context.add_requests(extracted_requests, strategy='same-domain') + + await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/guides/code_examples/httpcrawler_custom_parser/scrapling_parser.py b/docs/guides/code_examples/httpcrawler_custom_parser/scrapling_parser.py new file mode 100644 index 0000000000..201b9b0cbf --- /dev/null +++ b/docs/guides/code_examples/httpcrawler_custom_parser/scrapling_parser.py @@ -0,0 +1,74 @@ +import asyncio + +from pydantic import ValidationError +from scrapling.parser import Selector +from yarl import URL + +from crawlee import Request +from crawlee.crawlers import HttpCrawler, HttpCrawlingContext + + +async def main() -> None: + crawler = HttpCrawler( + max_request_retries=1, + max_requests_per_crawl=10, + ) + + @crawler.router.default_handler + async def request_handler(context: HttpCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Parse the HTML content using Scrapling. + page = Selector(await context.http_response.read(), url=context.request.url) + + # Extract data using Xpath selectors with .get_all_text method for full text + # content. 
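+        # Scrapling queries may yield Selector nodes or plain text values,
+        # hence the isinstance checks below.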
+ title_el = page.xpath_first('//title') + data = { + 'url': context.request.url, + 'title': title_el.text if isinstance(title_el, Selector) else title_el, + 'h1s': [ + h1.get_all_text() if isinstance(h1, Selector) else h1 + for h1 in page.xpath('//h1') + ], + 'h2s': [ + h2.get_all_text() if isinstance(h2, Selector) else h2 + for h2 in page.xpath('//h2') + ], + 'h3s': [ + h3.get_all_text() if isinstance(h3, Selector) else h3 + for h3 in page.xpath('//h3') + ], + } + await context.push_data(data) + + # Css selector to extract valid href attributes. + links_selector = ( + 'a[href]:not([href^="#"]):not([href^="javascript:"]):not([href^="mailto:"])' + ) + base_url = URL(context.request.url) + extracted_requests = [] + + # Extract links. + for item in page.css(links_selector): + href = item.attrib.get('href') if isinstance(item, Selector) else None + if not href: + continue + + # Convert relative URLs to absolute if needed. + url = str(base_url.join(URL(href))) + try: + request = Request.from_url(url) + except ValidationError as exc: + context.log.warning(f'Skipping invalid URL "{url}": {exc}') + continue + extracted_requests.append(request) + + # Add extracted requests to the queue with the same-domain strategy. + await context.add_requests(extracted_requests, strategy='same-domain') + + await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/guides/code_examples/httpcrawler_custom_parser/selectolax_parser.py b/docs/guides/code_examples/httpcrawler_custom_parser/selectolax_parser.py new file mode 100644 index 0000000000..ef279793ed --- /dev/null +++ b/docs/guides/code_examples/httpcrawler_custom_parser/selectolax_parser.py @@ -0,0 +1,63 @@ +import asyncio + +from pydantic import ValidationError +from selectolax.lexbor import LexborHTMLParser +from yarl import URL + +from crawlee import Request +from crawlee.crawlers import HttpCrawler, HttpCrawlingContext + + +async def main() -> None: + crawler = HttpCrawler( + max_request_retries=1, + max_requests_per_crawl=10, + ) + + @crawler.router.default_handler + async def request_handler(context: HttpCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Parse the HTML content using Selectolax with Lexbor backend. + parsed_html = LexborHTMLParser(await context.http_response.read()) + + # Extract data from the page. + data = { + 'url': context.request.url, + 'title': parsed_html.css_first('title').text(), + 'h1s': [h1.text() for h1 in parsed_html.css('h1')], + 'h2s': [h2.text() for h2 in parsed_html.css('h2')], + 'h3s': [h3.text() for h3 in parsed_html.css('h3')], + } + await context.push_data(data) + + # Css selector to extract valid href attributes. + links_selector = ( + 'a[href]:not([href^="#"]):not([href^="javascript:"]):not([href^="mailto:"])' + ) + base_url = URL(context.request.url) + extracted_requests = [] + + # Extract links. + for item in parsed_html.css(links_selector): + href = item.attributes.get('href') + if not href: + continue + + # Convert relative URLs to absolute if needed. + url = str(base_url.join(URL(href))) + try: + request = Request.from_url(url) + except ValidationError as exc: + context.log.warning(f'Skipping invalid URL "{url}": {exc}') + continue + extracted_requests.append(request) + + # Add extracted requests to the queue with the same-domain strategy. 
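+        # With 'same-domain', links leading off the current domain are dropped
+        # before they reach the queue.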
+ await context.add_requests(extracted_requests, strategy='same-domain') + + await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/guides/httpcrawler_custom_parser.mdx b/docs/guides/httpcrawler_custom_parser.mdx new file mode 100644 index 0000000000..a27e0da96e --- /dev/null +++ b/docs/guides/httpcrawler_custom_parser.mdx @@ -0,0 +1,67 @@ +--- +id: httpcrawler-with-custom-parser +title: HttpCrawler with custom parser +description: Learn how to use HttpCrawler with third-party parsing libraries instead of BeautifulSoup and Parsel. +--- + +import ApiLink from '@site/src/components/ApiLink'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import CodeBlock from '@theme/CodeBlock'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import LxmlParser from '!!raw-loader!roa-loader!./code_examples/httpcrawler_custom_parser/lxml_parser.py'; +import LxmlSaxoncheParser from '!!raw-loader!roa-loader!./code_examples/httpcrawler_custom_parser/lxml_saxonche_parser.py'; +import SelectolaxParser from '!!raw-loader!roa-loader!./code_examples/httpcrawler_custom_parser/selectolax_parser.py'; +import PyqueryParser from '!!raw-loader!roa-loader!./code_examples/httpcrawler_custom_parser/pyquery_parser.py'; +import ScraplingParser from '!!raw-loader!roa-loader!./code_examples/httpcrawler_custom_parser/scrapling_parser.py'; + +Crawlee provides `BeautifulSoupCrawler` and `ParselCrawler` as built-in solutions for HTML parsing. However, you may want to use a different parsing library that better fits your specific needs. The `HttpCrawler` gives you direct access to raw HTTP responses, allowing you to integrate any parsing library of your choice. + +When using `HttpCrawler` with a custom parser, helpers like `enqueue_links` and `extract_links` are not available. For deeper integration, use the `ParselCrawler` implementation as inspiration. + +The following sections demonstrate how to use various parsing libraries with `HttpCrawler` to extract data from a page and enqueue discovered links for further crawling. + +## lxml + +[lxml](https://lxml.de/) is a high-performance XML and HTML parser that provides Python bindings to the C libraries libxml2 and libxslt. It supports XPath 1.0, XSLT 1.0, and EXSLT extensions for element selection. The `make_links_absolute` method is particularly useful for converting relative URLs to absolute ones before link extraction. + + + {LxmlParser} + + +## lxml with SaxonC-HE + +Using [SaxonC-HE](https://pypi.org/project/saxonche/) together with lxml enables XPath 3.1 support, which provides advanced features like `distinct-values()` function and more powerful string manipulation. In this setup, lxml converts HTML to well-formed XML that SaxonC-HE can process. + + + {LxmlSaxoncheParser} + + +## selectolax + +[selectolax](https://github.com/rushter/selectolax) is a fast HTML parser that offers two backends: the default `Modest` engine and `Lexbor`. It provides a simple API with CSS selector support. The example below uses the `Lexbor` backend for optimal performance. + + + {SelectolaxParser} + + +## PyQuery + +[PyQuery](https://pyquery.readthedocs.io/) brings jQuery-like syntax to Python for HTML manipulation. Built on top of `lxml`, it combines familiar jQuery CSS selectors with Python's ease of use. This is a good choice if you're comfortable with jQuery syntax and want a straightforward API for DOM traversal and manipulation. 
+ + + {PyqueryParser} + + +## Scrapling + +[Scrapling](https://github.com/D4Vinci/Scrapling) is scraping library that provides both CSS selectors and XPath 1.0. It offers automatic text extraction and a Scrapy/BeautifulSoup-like API with pseudo-elements support similar to Parsel. + + + {ScraplingParser} + + +## Conclusion + +`HttpCrawler` provides a solid foundation for integrating any parsing library from the Python ecosystem. By giving you direct access to HTTP responses, it allows you to leverage the parser you're most comfortable with or the one that best fits your specific requirements. Whether you need XPath support, CSS selectors, or specialized parsing features, `HttpCrawler` makes it straightforward to incorporate your preferred tool into your scraping workflow. diff --git a/pyproject.toml b/pyproject.toml index b859a820a9..09f7056d8b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -256,6 +256,9 @@ module = [ "apify_fingerprint_datapoints", # Untyped and stubs not available "camoufox", # Example code shows integration of camoufox and crawlee. "fastapi", # Example code shows running in webserver. + "saxonche", # Example code shows HttpCrawler with custom parser. + "scrapling", # Example code shows HttpCrawler with custom parser. + "selectolax.*", # Example code shows HttpCrawler with custom parser. "stagehand.*", # Example code shows integration of Stagehand and crawlee. "starlette.*", # Example code shows running in webserver. "flask", # Example code shows deploy on Google Cloud. @@ -263,9 +266,11 @@ module = [ "jaro", # Untyped and stubs not available "litestar", # Example code shows deploy on Google Cloud Run. "loguru", # Example code shows integration of loguru and crawlee for JSON logging. + "lxml.*", # Example code shows HttpCrawler with custom parser. "sklearn.linear_model", # Untyped and stubs not available "cookiecutter.*", # Untyped and stubs not available "inquirer.*", # Untyped and stubs not available + "pyquery", # Example code shows HttpCrawler with custom parser. "warcio.*", # Example code shows WARC files creation. "wrapt" # Untyped and stubs not available ] From 247ef79495e2720744a05ca409adaa9727406c83 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Tue, 16 Dec 2025 20:12:50 +0000 Subject: [PATCH 02/18] fix --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 09f7056d8b..19795b833a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -257,7 +257,7 @@ module = [ "camoufox", # Example code shows integration of camoufox and crawlee. "fastapi", # Example code shows running in webserver. "saxonche", # Example code shows HttpCrawler with custom parser. - "scrapling", # Example code shows HttpCrawler with custom parser. + "scrapling.*", # Example code shows HttpCrawler with custom parser. "selectolax.*", # Example code shows HttpCrawler with custom parser. "stagehand.*", # Example code shows integration of Stagehand and crawlee. "starlette.*", # Example code shows running in webserver. 
From 758424bb52b71015d5b1ab743928489cb92aca36 Mon Sep 17 00:00:00 2001
From: Max Bohomolov
Date: Thu, 18 Dec 2025 03:24:38 +0000
Subject: [PATCH 03/18] add `AbstractHttpCrawler` section

---
 .../crawler_custom_parser/__init__.py         |   0
 .../lexbor_parser.py}                         |   0
 .../lxml_parser.py                            |   0
 .../lxml_saxonche_parser.py                   |   0
 .../pyquery_parser.py                         |   0
 .../scrapling_parser.py                       |   0
 .../selectolax_context.py                     |  33 +++++
 .../selectolax_crawler.py                     |  42 +++++++
 .../selectolax_crawler_run.py                 |  27 ++++
 .../selectolax_parser.py                      |  60 +++++++++
 docs/guides/crawler_custom_parser.mdx         | 117 ++++++++++++++++++
 docs/guides/httpcrawler_custom_parser.mdx     |  67 ----------
 selectolax_crawler_run.py                     |  27 ++++
 13 files changed, 306 insertions(+), 67 deletions(-)
 create mode 100644 docs/guides/code_examples/crawler_custom_parser/__init__.py
 rename docs/guides/code_examples/{httpcrawler_custom_parser/selectolax_parser.py => crawler_custom_parser/lexbor_parser.py} (100%)
 rename docs/guides/code_examples/{httpcrawler_custom_parser => crawler_custom_parser}/lxml_parser.py (100%)
 rename docs/guides/code_examples/{httpcrawler_custom_parser => crawler_custom_parser}/lxml_saxonche_parser.py (100%)
 rename docs/guides/code_examples/{httpcrawler_custom_parser => crawler_custom_parser}/pyquery_parser.py (100%)
 rename docs/guides/code_examples/{httpcrawler_custom_parser => crawler_custom_parser}/scrapling_parser.py (100%)
 create mode 100644 docs/guides/code_examples/crawler_custom_parser/selectolax_context.py
 create mode 100644 docs/guides/code_examples/crawler_custom_parser/selectolax_crawler.py
 create mode 100644 docs/guides/code_examples/crawler_custom_parser/selectolax_crawler_run.py
 create mode 100644 docs/guides/code_examples/crawler_custom_parser/selectolax_parser.py
 create mode 100644 docs/guides/crawler_custom_parser.mdx
 delete mode 100644 docs/guides/httpcrawler_custom_parser.mdx
 create mode 100644 selectolax_crawler_run.py
diff --git a/docs/guides/code_examples/crawler_custom_parser/__init__.py b/docs/guides/code_examples/crawler_custom_parser/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/docs/guides/code_examples/httpcrawler_custom_parser/selectolax_parser.py b/docs/guides/code_examples/crawler_custom_parser/lexbor_parser.py
similarity index 100%
rename from docs/guides/code_examples/httpcrawler_custom_parser/selectolax_parser.py
rename to docs/guides/code_examples/crawler_custom_parser/lexbor_parser.py
diff --git a/docs/guides/code_examples/httpcrawler_custom_parser/lxml_parser.py b/docs/guides/code_examples/crawler_custom_parser/lxml_parser.py
similarity index 100%
rename from docs/guides/code_examples/httpcrawler_custom_parser/lxml_parser.py
rename to docs/guides/code_examples/crawler_custom_parser/lxml_parser.py
diff --git a/docs/guides/code_examples/httpcrawler_custom_parser/lxml_saxonche_parser.py b/docs/guides/code_examples/crawler_custom_parser/lxml_saxonche_parser.py
similarity index 100%
rename from docs/guides/code_examples/httpcrawler_custom_parser/lxml_saxonche_parser.py
rename to docs/guides/code_examples/crawler_custom_parser/lxml_saxonche_parser.py
diff --git a/docs/guides/code_examples/httpcrawler_custom_parser/pyquery_parser.py b/docs/guides/code_examples/crawler_custom_parser/pyquery_parser.py
similarity index 100%
rename from docs/guides/code_examples/httpcrawler_custom_parser/pyquery_parser.py
rename to docs/guides/code_examples/crawler_custom_parser/pyquery_parser.py
diff --git a/docs/guides/code_examples/httpcrawler_custom_parser/scrapling_parser.py b/docs/guides/code_examples/crawler_custom_parser/scrapling_parser.py
similarity index 100%
rename from docs/guides/code_examples/httpcrawler_custom_parser/scrapling_parser.py
rename to docs/guides/code_examples/crawler_custom_parser/scrapling_parser.py
diff --git a/docs/guides/code_examples/crawler_custom_parser/selectolax_context.py b/docs/guides/code_examples/crawler_custom_parser/selectolax_context.py
new file mode 100644
index 0000000000..8fe36c039c
--- /dev/null
+++ b/docs/guides/code_examples/crawler_custom_parser/selectolax_context.py
@@ -0,0 +1,33 @@
+from dataclasses import dataclass, fields
+
+from selectolax.lexbor import LexborHTMLParser
+from typing_extensions import Self
+
+from crawlee.crawlers._abstract_http import ParsedHttpCrawlingContext
+
+
+@dataclass(frozen=True)
+class SelectolaxLexborContext(ParsedHttpCrawlingContext[LexborHTMLParser]):
+    """Crawling context providing access to the parsed page.
+
+    This context is passed to request handlers and includes all standard
+    context methods (push_data, enqueue_links, etc.) plus custom helpers.
+    """
+
+    @property
+    def parser(self) -> LexborHTMLParser:
+        """Convenient alias for accessing the parsed document."""
+        return self.parsed_content
+
+    @classmethod
+    def from_parsed_http_crawling_context(
+        cls, context: ParsedHttpCrawlingContext[LexborHTMLParser]
+    ) -> Self:
+        """Create custom context from the base context.
+
+        Copies all fields from the base context to preserve framework
+        functionality while adding a custom interface.
+        """
+        return cls(
+            **{field.name: getattr(context, field.name) for field in fields(context)}
+        )
diff --git a/docs/guides/code_examples/crawler_custom_parser/selectolax_crawler.py b/docs/guides/code_examples/crawler_custom_parser/selectolax_crawler.py
new file mode 100644
index 0000000000..d5efc466e6
--- /dev/null
+++ b/docs/guides/code_examples/crawler_custom_parser/selectolax_crawler.py
@@ -0,0 +1,42 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from selectolax.lexbor import LexborHTMLParser, LexborNode
+
+from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions
+
+from .selectolax_context import SelectolaxLexborContext
+from .selectolax_parser import SelectolaxLexborParser
+
+if TYPE_CHECKING:
+    from collections.abc import AsyncGenerator
+
+    from typing_extensions import Unpack
+
+    from crawlee.crawlers._abstract_http import ParsedHttpCrawlingContext
+
+
+class SelectolaxLexborCrawler(
+    AbstractHttpCrawler[SelectolaxLexborContext, LexborHTMLParser, LexborNode]
+):
+    """Custom crawler using Selectolax Lexbor for HTML parsing."""
+
+    def __init__(
+        self,
+        **kwargs: Unpack[HttpCrawlerOptions[SelectolaxLexborContext]],
+    ) -> None:
+        # Final step converts the base context to custom context type.
+        async def final_step(
+            context: ParsedHttpCrawlingContext[LexborHTMLParser],
+        ) -> AsyncGenerator[SelectolaxLexborContext, None]:
+            yield SelectolaxLexborContext.from_parsed_http_crawling_context(context)
+
+        # Build context pipeline: HTTP request -> parsing -> custom context.
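+        # `_create_static_content_crawler_pipeline` supplies the shared
+        # fetch-and-parse steps; `compose` appends `final_step` to the chain.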
+ kwargs['_context_pipeline'] = ( + self._create_static_content_crawler_pipeline().compose(final_step) + ) + super().__init__( + parser=SelectolaxLexborParser(), + **kwargs, + ) diff --git a/docs/guides/code_examples/crawler_custom_parser/selectolax_crawler_run.py b/docs/guides/code_examples/crawler_custom_parser/selectolax_crawler_run.py new file mode 100644 index 0000000000..52c25ac4da --- /dev/null +++ b/docs/guides/code_examples/crawler_custom_parser/selectolax_crawler_run.py @@ -0,0 +1,27 @@ +import asyncio + +from .selectolax_crawler import SelectolaxLexborContext, SelectolaxLexborCrawler + + +async def main() -> None: + crawler = SelectolaxLexborCrawler( + max_requests_per_crawl=10, + ) + + @crawler.router.default_handler + async def handle_request(context: SelectolaxLexborContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + data = { + 'url': context.request.url, + 'title': context.parser.css_first('title').text(), + } + + await context.push_data(data) + await context.enqueue_links() + + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/guides/code_examples/crawler_custom_parser/selectolax_parser.py b/docs/guides/code_examples/crawler_custom_parser/selectolax_parser.py new file mode 100644 index 0000000000..2e4f5f0f6f --- /dev/null +++ b/docs/guides/code_examples/crawler_custom_parser/selectolax_parser.py @@ -0,0 +1,60 @@ +from __future__ import annotations + +import asyncio +from typing import TYPE_CHECKING + +from selectolax.lexbor import LexborHTMLParser, LexborNode +from typing_extensions import override + +from crawlee.crawlers._abstract_http import AbstractHttpParser + +if TYPE_CHECKING: + from collections.abc import Iterable, Sequence + + from crawlee.http_clients import HttpResponse + + +class SelectolaxLexborParser(AbstractHttpParser[LexborHTMLParser, LexborNode]): + """Parser for parsing HTTP response using Selectolax Lexbor.""" + + @override + async def parse(self, response: HttpResponse) -> LexborHTMLParser: + """Parse HTTP response body into a document object.""" + response_body = await response.read() + # Run parsing in a thread to avoid blocking the event loop. + return await asyncio.to_thread(lambda: LexborHTMLParser(response_body)) + + @override + async def parse_text(self, text: str) -> LexborHTMLParser: + """Parse raw HTML string into a document object.""" + return LexborHTMLParser(text) + + @override + async def select( + self, parsed_content: LexborHTMLParser, selector: str + ) -> Sequence[LexborNode]: + """Select elements matching a CSS selector.""" + return tuple(match for match in parsed_content.css(selector)) + + @override + def is_matching_selector( + self, parsed_content: LexborHTMLParser, selector: str + ) -> bool: + """Check if any element matches the selector.""" + return parsed_content.css_first(selector) is not None + + @override + def find_links( + self, parsed_content: LexborHTMLParser, selector: str + ) -> Iterable[str]: + """Extract href attributes from elements matching the selector. + + Used by `enqueue_links` helper to discover URLs. 
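+
+        Relative hrefs may be returned as-is; the crawler is expected to
+        resolve them against the page URL before enqueueing.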
+ """ + link: LexborNode + urls: list[str] = [] + for link in parsed_content.css(selector): + url = link.attributes.get('href') + if url: + urls.append(url.strip()) + return urls diff --git a/docs/guides/crawler_custom_parser.mdx b/docs/guides/crawler_custom_parser.mdx new file mode 100644 index 0000000000..b4ef37e7c2 --- /dev/null +++ b/docs/guides/crawler_custom_parser.mdx @@ -0,0 +1,117 @@ +--- +id: crawler-with-custom-parser +title: Crawler with custom parser +description: Learn how to use HttpCrawler with third-party parsing libraries and how to create a custom crawler with full framework integration. +--- + +import ApiLink from '@site/src/components/ApiLink'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import CodeBlock from '@theme/CodeBlock'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import LxmlParser from '!!raw-loader!roa-loader!./code_examples/crawler_custom_parser/lxml_parser.py'; +import LxmlSaxoncheParser from '!!raw-loader!roa-loader!./code_examples/crawler_custom_parser/lxml_saxonche_parser.py'; +import LexborParser from '!!raw-loader!roa-loader!./code_examples/crawler_custom_parser/lexbor_parser.py'; +import PyqueryParser from '!!raw-loader!roa-loader!./code_examples/crawler_custom_parser/pyquery_parser.py'; +import ScraplingParser from '!!raw-loader!roa-loader!./code_examples/crawler_custom_parser/scrapling_parser.py'; + +import SelectolaxParserSource from '!!raw-loader!./code_examples/crawler_custom_parser/selectolax_parser.py'; +import SelectolaxContextSource from '!!raw-loader!./code_examples/crawler_custom_parser/selectolax_context.py'; +import SelectolaxCrawlerSource from '!!raw-loader!./code_examples/crawler_custom_parser/selectolax_crawler.py'; +import SelectolaxCrawlerRunSource from '!!raw-loader!./code_examples/crawler_custom_parser/selectolax_crawler_run.py'; + +Crawlee provides `BeautifulSoupCrawler` and `ParselCrawler` as built-in solutions for HTML parsing. However, you may want to use a different parsing library that better fits your specific needs. + +There are two approaches to integrate a custom parser: + +- **Using `HttpCrawler`** — Parse raw responses directly in request handlers. Quick to set up, but helpers like `enqueue_links` are not available. Best for simple scraping tasks or quick prototyping. +- **Creating a custom crawler** — Implement a crawler based on `AbstractHttpCrawler` for full framework integration. Best for reusable crawlers or when you need full integration and support in `AdaptivePlaywrightCrawler`. + +## Using HttpCrawler with custom parser + +The `HttpCrawler` gives you direct access to raw HTTP responses, allowing you to integrate any parsing library of your choice. When using this approach, helpers like `enqueue_links` and `extract_links` are not available, and it requires minimal setup. + +The following sections demonstrate how to use various parsing libraries with `HttpCrawler` to extract data from a page and enqueue discovered links for further crawling. + +### lxml + +[lxml](https://lxml.de/) is a high-performance XML and HTML parser that provides Python bindings to the C libraries libxml2 and libxslt. It supports XPath 1.0, XSLT 1.0, and EXSLT extensions for element selection. The `make_links_absolute` method is particularly useful for converting relative URLs to absolute ones before link extraction. 
+ + + {LxmlParser} + + +### lxml with SaxonC-HE + +Using [SaxonC-HE](https://pypi.org/project/saxonche/) together with lxml enables XPath 3.1 support, which provides advanced features like `distinct-values()` function and more powerful string manipulation. In this setup, lxml converts HTML to well-formed XML that SaxonC-HE can process. + + + {LxmlSaxoncheParser} + + +### selectolax + +[selectolax](https://github.com/rushter/selectolax) is a fast HTML parser that offers two backends: the default `Modest` engine and `Lexbor`. It provides a simple API with CSS selector support. The example below uses the `Lexbor` backend for optimal performance. + + + {LexborParser} + + +### PyQuery + +[PyQuery](https://pyquery.readthedocs.io/) brings jQuery-like syntax to Python for HTML manipulation. Built on top of `lxml`, it combines familiar jQuery CSS selectors with Python's ease of use. This is a good choice if you're comfortable with jQuery syntax and want a straightforward API for DOM traversal and manipulation. + + + {PyqueryParser} + + +### Scrapling + +[Scrapling](https://github.com/D4Vinci/Scrapling) is a scraping library that provides both CSS selectors and XPath 1.0. It offers automatic text extraction and a Scrapy/BeautifulSoup-like API with pseudo-elements support similar to Parsel. + + + {ScraplingParser} + + +## Creating a custom crawler + +For deeper integration with full access to framework helpers like `enqueue_links`, you can create a custom crawler based on `AbstractHttpCrawler`. This approach requires implementing three components — a parser, a crawling context, and the crawler class — but provides a seamless experience similar to built-in crawlers. + +The following example demonstrates how to create a custom crawler using `selectolax` with the `Lexbor` engine. + +### Implementing the parser + +The parser converts HTTP responses into a parsed document and provides methods for element selection. Implement `AbstractHttpParser` using `selectolax` with required methods for parsing and querying: + + + {SelectolaxParserSource} + + +### Defining the crawling context + +The crawling context is passed to request handlers and provides access to the parsed content. Extend `ParsedHttpCrawlingContext` to define the interface your handlers will work with: + + + {SelectolaxContextSource} + + +### Building the crawler + +The crawler class connects the parser and context. Extend `AbstractHttpCrawler` and configure the context pipeline to use your custom components: + + + {SelectolaxCrawlerSource} + + +### Using the crawler + +The custom crawler works like any built-in crawler. Request handlers receive your custom context with full access to framework helpers like `enqueue_links`: + + + {SelectolaxCrawlerRunSource} + + +## Conclusion + +Crawlee offers flexible options for integrating custom parsing libraries. Use `HttpCrawler` for quick integration when you need to parse responses with your preferred library. For full framework integration with helpers like `enqueue_links`, implement a custom crawler using `AbstractHttpCrawler`. Both approaches allow you to leverage any parser from the Python ecosystem while benefiting from Crawlee's request management, rate limiting, and data storage features. 
diff --git a/docs/guides/httpcrawler_custom_parser.mdx b/docs/guides/httpcrawler_custom_parser.mdx deleted file mode 100644 index a27e0da96e..0000000000 --- a/docs/guides/httpcrawler_custom_parser.mdx +++ /dev/null @@ -1,67 +0,0 @@ ---- -id: httpcrawler-with-custom-parser -title: HttpCrawler with custom parser -description: Learn how to use HttpCrawler with third-party parsing libraries instead of BeautifulSoup and Parsel. ---- - -import ApiLink from '@site/src/components/ApiLink'; -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import CodeBlock from '@theme/CodeBlock'; -import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; - -import LxmlParser from '!!raw-loader!roa-loader!./code_examples/httpcrawler_custom_parser/lxml_parser.py'; -import LxmlSaxoncheParser from '!!raw-loader!roa-loader!./code_examples/httpcrawler_custom_parser/lxml_saxonche_parser.py'; -import SelectolaxParser from '!!raw-loader!roa-loader!./code_examples/httpcrawler_custom_parser/selectolax_parser.py'; -import PyqueryParser from '!!raw-loader!roa-loader!./code_examples/httpcrawler_custom_parser/pyquery_parser.py'; -import ScraplingParser from '!!raw-loader!roa-loader!./code_examples/httpcrawler_custom_parser/scrapling_parser.py'; - -Crawlee provides `BeautifulSoupCrawler` and `ParselCrawler` as built-in solutions for HTML parsing. However, you may want to use a different parsing library that better fits your specific needs. The `HttpCrawler` gives you direct access to raw HTTP responses, allowing you to integrate any parsing library of your choice. - -When using `HttpCrawler` with a custom parser, helpers like `enqueue_links` and `extract_links` are not available. For deeper integration, use the `ParselCrawler` implementation as inspiration. - -The following sections demonstrate how to use various parsing libraries with `HttpCrawler` to extract data from a page and enqueue discovered links for further crawling. - -## lxml - -[lxml](https://lxml.de/) is a high-performance XML and HTML parser that provides Python bindings to the C libraries libxml2 and libxslt. It supports XPath 1.0, XSLT 1.0, and EXSLT extensions for element selection. The `make_links_absolute` method is particularly useful for converting relative URLs to absolute ones before link extraction. - - - {LxmlParser} - - -## lxml with SaxonC-HE - -Using [SaxonC-HE](https://pypi.org/project/saxonche/) together with lxml enables XPath 3.1 support, which provides advanced features like `distinct-values()` function and more powerful string manipulation. In this setup, lxml converts HTML to well-formed XML that SaxonC-HE can process. - - - {LxmlSaxoncheParser} - - -## selectolax - -[selectolax](https://github.com/rushter/selectolax) is a fast HTML parser that offers two backends: the default `Modest` engine and `Lexbor`. It provides a simple API with CSS selector support. The example below uses the `Lexbor` backend for optimal performance. - - - {SelectolaxParser} - - -## PyQuery - -[PyQuery](https://pyquery.readthedocs.io/) brings jQuery-like syntax to Python for HTML manipulation. Built on top of `lxml`, it combines familiar jQuery CSS selectors with Python's ease of use. This is a good choice if you're comfortable with jQuery syntax and want a straightforward API for DOM traversal and manipulation. - - - {PyqueryParser} - - -## Scrapling - -[Scrapling](https://github.com/D4Vinci/Scrapling) is scraping library that provides both CSS selectors and XPath 1.0. 
It offers automatic text extraction and a Scrapy/BeautifulSoup-like API with pseudo-elements support similar to Parsel. - - - {ScraplingParser} - - -## Conclusion - -`HttpCrawler` provides a solid foundation for integrating any parsing library from the Python ecosystem. By giving you direct access to HTTP responses, it allows you to leverage the parser you're most comfortable with or the one that best fits your specific requirements. Whether you need XPath support, CSS selectors, or specialized parsing features, `HttpCrawler` makes it straightforward to incorporate your preferred tool into your scraping workflow. diff --git a/selectolax_crawler_run.py b/selectolax_crawler_run.py new file mode 100644 index 0000000000..65e9186bf8 --- /dev/null +++ b/selectolax_crawler_run.py @@ -0,0 +1,27 @@ +import asyncio + +from test_selectolax.selectolax_crawler import SelectolaxLexborContext, SelectolaxLexborCrawler + + +async def main() -> None: + crawler = SelectolaxLexborCrawler( + max_requests_per_crawl=10, + ) + + @crawler.router.default_handler + async def handle_request(context: SelectolaxLexborContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + data = { + 'url': context.request.url, + 'title': context.parser.css_first('title').text(), + } + + await context.push_data(data) + await context.enqueue_links() + + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) From 7a9e092dea17ff95b87be3748f2a3a810ac7a8c1 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Thu, 18 Dec 2025 03:26:32 +0000 Subject: [PATCH 04/18] del extra file --- selectolax_crawler_run.py | 27 --------------------------- 1 file changed, 27 deletions(-) delete mode 100644 selectolax_crawler_run.py diff --git a/selectolax_crawler_run.py b/selectolax_crawler_run.py deleted file mode 100644 index 65e9186bf8..0000000000 --- a/selectolax_crawler_run.py +++ /dev/null @@ -1,27 +0,0 @@ -import asyncio - -from test_selectolax.selectolax_crawler import SelectolaxLexborContext, SelectolaxLexborCrawler - - -async def main() -> None: - crawler = SelectolaxLexborCrawler( - max_requests_per_crawl=10, - ) - - @crawler.router.default_handler - async def handle_request(context: SelectolaxLexborContext) -> None: - context.log.info(f'Processing {context.request.url} ...') - - data = { - 'url': context.request.url, - 'title': context.parser.css_first('title').text(), - } - - await context.push_data(data) - await context.enqueue_links() - - await crawler.run(['https://crawlee.dev/']) - - -if __name__ == '__main__': - asyncio.run(main()) From a895901b7d699e9e185a4bb21182c6a9c754fabe Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Fri, 19 Dec 2025 00:11:37 +0000 Subject: [PATCH 05/18] add AdaptivePlaywrightCrawler example --- .../selectolax_adaptive_run.py | 38 ++++++++ .../selectolax_context.py | 2 + docs/guides/crawler_custom_parser.mdx | 89 +++++++++---------- src/crawlee/crawlers/__init__.py | 2 + .../crawlers/_adaptive_playwright/__init__.py | 3 + 5 files changed, 89 insertions(+), 45 deletions(-) create mode 100644 docs/guides/code_examples/crawler_custom_parser/selectolax_adaptive_run.py diff --git a/docs/guides/code_examples/crawler_custom_parser/selectolax_adaptive_run.py b/docs/guides/code_examples/crawler_custom_parser/selectolax_adaptive_run.py new file mode 100644 index 0000000000..1419a51fed --- /dev/null +++ b/docs/guides/code_examples/crawler_custom_parser/selectolax_adaptive_run.py @@ -0,0 +1,38 @@ +import asyncio + +from crawlee.crawlers import ( + 
AdaptivePlaywrightCrawler, + AdaptivePlaywrightCrawlerStatisticState, + AdaptivePlaywrightCrawlingContext, +) +from crawlee.statistics import Statistics + +from .selectolax_parser import SelectolaxLexborParser + + +async def main() -> None: + crawler: AdaptivePlaywrightCrawler = AdaptivePlaywrightCrawler( + max_requests_per_crawl=10, + # Use custom Selectolax parser for static content parsing. + static_parser=SelectolaxLexborParser(), + # Set up statistics with AdaptivePlaywrightCrawlerStatisticState. + statistics=Statistics(state_model=AdaptivePlaywrightCrawlerStatisticState), + ) + + @crawler.router.default_handler + async def handle_request(context: AdaptivePlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + data = { + 'url': context.request.url, + 'title': await context.query_selector_one('title'), + } + + await context.push_data(data) + + await context.enqueue_links() + + await crawler.run(['https://crawlee.dev/']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/guides/code_examples/crawler_custom_parser/selectolax_context.py b/docs/guides/code_examples/crawler_custom_parser/selectolax_context.py index 8fe36c039c..24c7bc7755 100644 --- a/docs/guides/code_examples/crawler_custom_parser/selectolax_context.py +++ b/docs/guides/code_examples/crawler_custom_parser/selectolax_context.py @@ -14,6 +14,8 @@ class SelectolaxLexborContext(ParsedHttpCrawlingContext[LexborHTMLParser]): context methods (push_data, enqueue_links, etc.) plus custom helpers. """ + # It is only for convenience and not strictly necessary, as the + # parsed_content field is already available from the base class. @property def parser(self) -> LexborHTMLParser: """Convenient alias for accessing the parsed document.""" diff --git a/docs/guides/crawler_custom_parser.mdx b/docs/guides/crawler_custom_parser.mdx index b4ef37e7c2..0995a7816c 100644 --- a/docs/guides/crawler_custom_parser.mdx +++ b/docs/guides/crawler_custom_parser.mdx @@ -20,6 +20,7 @@ import SelectolaxParserSource from '!!raw-loader!./code_examples/crawler_custom_ import SelectolaxContextSource from '!!raw-loader!./code_examples/crawler_custom_parser/selectolax_context.py'; import SelectolaxCrawlerSource from '!!raw-loader!./code_examples/crawler_custom_parser/selectolax_crawler.py'; import SelectolaxCrawlerRunSource from '!!raw-loader!./code_examples/crawler_custom_parser/selectolax_crawler_run.py'; +import AdaptiveCrawlerRunSource from '!!raw-loader!./code_examples/crawler_custom_parser/selectolax_adaptive_run.py'; Crawlee provides `BeautifulSoupCrawler` and `ParselCrawler` as built-in solutions for HTML parsing. However, you may want to use a different parsing library that better fits your specific needs. @@ -32,47 +33,35 @@ There are two approaches to integrate a custom parser: The `HttpCrawler` gives you direct access to raw HTTP responses, allowing you to integrate any parsing library of your choice. When using this approach, helpers like `enqueue_links` and `extract_links` are not available, and it requires minimal setup. -The following sections demonstrate how to use various parsing libraries with `HttpCrawler` to extract data from a page and enqueue discovered links for further crawling. - -### lxml - -[lxml](https://lxml.de/) is a high-performance XML and HTML parser that provides Python bindings to the C libraries libxml2 and libxslt. It supports XPath 1.0, XSLT 1.0, and EXSLT extensions for element selection. 
The `make_links_absolute` method is particularly useful for converting relative URLs to absolute ones before link extraction. - - - {LxmlParser} - - -### lxml with SaxonC-HE - -Using [SaxonC-HE](https://pypi.org/project/saxonche/) together with lxml enables XPath 3.1 support, which provides advanced features like `distinct-values()` function and more powerful string manipulation. In this setup, lxml converts HTML to well-formed XML that SaxonC-HE can process. - - - {LxmlSaxoncheParser} - - -### selectolax - -[selectolax](https://github.com/rushter/selectolax) is a fast HTML parser that offers two backends: the default `Modest` engine and `Lexbor`. It provides a simple API with CSS selector support. The example below uses the `Lexbor` backend for optimal performance. - - - {LexborParser} - - -### PyQuery - -[PyQuery](https://pyquery.readthedocs.io/) brings jQuery-like syntax to Python for HTML manipulation. Built on top of `lxml`, it combines familiar jQuery CSS selectors with Python's ease of use. This is a good choice if you're comfortable with jQuery syntax and want a straightforward API for DOM traversal and manipulation. - - - {PyqueryParser} - - -### Scrapling - -[Scrapling](https://github.com/D4Vinci/Scrapling) is a scraping library that provides both CSS selectors and XPath 1.0. It offers automatic text extraction and a Scrapy/BeautifulSoup-like API with pseudo-elements support similar to Parsel. - - - {ScraplingParser} - +The following examples demonstrate integration with various parsing libraries: [lxml](https://lxml.de/) for high-performance XPath 1.0 parsing, [lxml with SaxonC-HE](https://pypi.org/project/saxonche/) for XPath 3.1 support, [selectolax](https://github.com/rushter/selectolax) for fast CSS selector-based parsing, [PyQuery](https://pyquery.readthedocs.io/) for jQuery-like syntax, and [scrapling](https://github.com/D4Vinci/Scrapling) for CSS and XPath selectors with Scrapy/Parsel-like API and BeautifulSoup-style find methods. + + + + + {LxmlParser} + + + + + {LxmlSaxoncheParser} + + + + + {LexborParser} + + + + + {PyqueryParser} + + + + + {ScraplingParser} + + + ## Creating a custom crawler @@ -106,11 +95,21 @@ The crawler class connects the parser and context. Extend `enqueue_links`: +The custom crawler works like any built-in crawler. Request handlers receive your custom context with full access to framework helpers like `enqueue_links`. 
Additionally, the custom parser can be used with `AdaptivePlaywrightCrawler` for adaptive crawling: + + + + + {SelectolaxCrawlerRunSource} + + + + + {AdaptiveCrawlerRunSource} + + + - - {SelectolaxCrawlerRunSource} - ## Conclusion diff --git a/src/crawlee/crawlers/__init__.py b/src/crawlee/crawlers/__init__.py index ec280f94c4..2f718936a5 100644 --- a/src/crawlee/crawlers/__init__.py +++ b/src/crawlee/crawlers/__init__.py @@ -23,12 +23,14 @@ 'AdaptivePlaywrightCrawler', 'AdaptivePlaywrightCrawlingContext', 'AdaptivePlaywrightPreNavCrawlingContext', + 'AdaptivePlaywrightCrawlerStatisticState', 'RenderingType', 'RenderingTypePrediction', 'RenderingTypePredictor', ): from ._adaptive_playwright import ( AdaptivePlaywrightCrawler, + AdaptivePlaywrightCrawlerStatisticState, AdaptivePlaywrightCrawlingContext, AdaptivePlaywrightPreNavCrawlingContext, RenderingType, diff --git a/src/crawlee/crawlers/_adaptive_playwright/__init__.py b/src/crawlee/crawlers/_adaptive_playwright/__init__.py index fccf9590a1..3495acad50 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/__init__.py +++ b/src/crawlee/crawlers/_adaptive_playwright/__init__.py @@ -15,9 +15,12 @@ from ._rendering_type_predictor import RenderingType, RenderingTypePrediction, RenderingTypePredictor with _try_import(__name__, 'BeautifulSoupCrawlingContext'): from ._adaptive_playwright_crawler import AdaptivePlaywrightCrawler +with _try_import(__name__, 'AdaptivePlaywrightCrawlerStatisticState'): + from ._adaptive_playwright_crawler import AdaptivePlaywrightCrawlerStatisticState __all__ = [ 'AdaptivePlaywrightCrawler', + 'AdaptivePlaywrightCrawlerStatisticState', 'AdaptivePlaywrightCrawlingContext', 'AdaptivePlaywrightPreNavCrawlingContext', 'RenderingType', From 5844f4b7c69190b675e0c8b7a5530745e4e8ac95 Mon Sep 17 00:00:00 2001 From: Max Bohomolov <34358312+Mantisus@users.noreply.github.com> Date: Fri, 19 Dec 2025 15:27:23 +0200 Subject: [PATCH 06/18] Update docs/guides/code_examples/crawler_custom_parser/selectolax_adaptive_run.py Co-authored-by: Jan Buchar --- .../crawler_custom_parser/selectolax_adaptive_run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/guides/code_examples/crawler_custom_parser/selectolax_adaptive_run.py b/docs/guides/code_examples/crawler_custom_parser/selectolax_adaptive_run.py index 1419a51fed..8bcd919602 100644 --- a/docs/guides/code_examples/crawler_custom_parser/selectolax_adaptive_run.py +++ b/docs/guides/code_examples/crawler_custom_parser/selectolax_adaptive_run.py @@ -11,7 +11,7 @@ async def main() -> None: - crawler: AdaptivePlaywrightCrawler = AdaptivePlaywrightCrawler( + crawler = AdaptivePlaywrightCrawler( max_requests_per_crawl=10, # Use custom Selectolax parser for static content parsing. 
         static_parser=SelectolaxLexborParser(),
         # Set up statistics with AdaptivePlaywrightCrawlerStatisticState.
         statistics=Statistics(state_model=AdaptivePlaywrightCrawlerStatisticState),

From 7fe669eeb2dec81cafdc4ac210c9175fb4971172 Mon Sep 17 00:00:00 2001
From: Max Bohomolov
Date: Fri, 19 Dec 2025 15:24:58 +0000
Subject: [PATCH 07/18] integrate into HTTP crawlers guide

---
 .../__init__.py                               |   0
 .../lexbor_parser.py                          |   0
 .../lxml_parser.py                            |   0
 .../lxml_saxonche_parser.py                   |   0
 .../pyquery_parser.py                         |   0
 .../scrapling_parser.py                       |   0
 .../selectolax_adaptive_run.py                |   2 +-
 .../selectolax_context.py                     |   2 -
 .../selectolax_crawler.py                     |   0
 .../selectolax_crawler_run.py                 |   0
 .../selectolax_parser.py                      |   0
 docs/guides/crawler_custom_parser.mdx         | 116 ------------------
 docs/guides/http_crawlers.mdx                 |  96 ++++++++++++++-
 13 files changed, 95 insertions(+), 121 deletions(-)
 rename docs/guides/code_examples/{crawler_custom_parser => http_crawlers}/__init__.py (100%)
 rename docs/guides/code_examples/{crawler_custom_parser => http_crawlers}/lexbor_parser.py (100%)
 rename docs/guides/code_examples/{crawler_custom_parser => http_crawlers}/lxml_parser.py (100%)
 rename docs/guides/code_examples/{crawler_custom_parser => http_crawlers}/lxml_saxonche_parser.py (100%)
 rename docs/guides/code_examples/{crawler_custom_parser => http_crawlers}/pyquery_parser.py (100%)
 rename docs/guides/code_examples/{crawler_custom_parser => http_crawlers}/scrapling_parser.py (100%)
 rename docs/guides/code_examples/{crawler_custom_parser => http_crawlers}/selectolax_adaptive_run.py (94%)
 rename docs/guides/code_examples/{crawler_custom_parser => http_crawlers}/selectolax_context.py (89%)
 rename docs/guides/code_examples/{crawler_custom_parser => http_crawlers}/selectolax_crawler.py (100%)
 rename docs/guides/code_examples/{crawler_custom_parser => http_crawlers}/selectolax_crawler_run.py (100%)
 rename docs/guides/code_examples/{crawler_custom_parser => http_crawlers}/selectolax_parser.py (100%)
 delete mode 100644 docs/guides/crawler_custom_parser.mdx
diff --git a/docs/guides/code_examples/crawler_custom_parser/__init__.py b/docs/guides/code_examples/http_crawlers/__init__.py
similarity index 100%
rename from docs/guides/code_examples/crawler_custom_parser/__init__.py
rename to docs/guides/code_examples/http_crawlers/__init__.py
diff --git a/docs/guides/code_examples/crawler_custom_parser/lexbor_parser.py b/docs/guides/code_examples/http_crawlers/lexbor_parser.py
similarity index 100%
rename from docs/guides/code_examples/crawler_custom_parser/lexbor_parser.py
rename to docs/guides/code_examples/http_crawlers/lexbor_parser.py
diff --git a/docs/guides/code_examples/crawler_custom_parser/lxml_parser.py b/docs/guides/code_examples/http_crawlers/lxml_parser.py
similarity index 100%
rename from docs/guides/code_examples/crawler_custom_parser/lxml_parser.py
rename to docs/guides/code_examples/http_crawlers/lxml_parser.py
diff --git a/docs/guides/code_examples/crawler_custom_parser/lxml_saxonche_parser.py b/docs/guides/code_examples/http_crawlers/lxml_saxonche_parser.py
similarity index 100%
rename from docs/guides/code_examples/crawler_custom_parser/lxml_saxonche_parser.py
rename to docs/guides/code_examples/http_crawlers/lxml_saxonche_parser.py
diff --git a/docs/guides/code_examples/crawler_custom_parser/pyquery_parser.py b/docs/guides/code_examples/http_crawlers/pyquery_parser.py
similarity index 100%
rename from docs/guides/code_examples/crawler_custom_parser/pyquery_parser.py
rename to docs/guides/code_examples/http_crawlers/pyquery_parser.py
diff --git a/docs/guides/code_examples/crawler_custom_parser/scrapling_parser.py b/docs/guides/code_examples/http_crawlers/scrapling_parser.py
similarity index 100% rename from docs/guides/code_examples/crawler_custom_parser/scrapling_parser.py rename to docs/guides/code_examples/http_crawlers/scrapling_parser.py diff --git a/docs/guides/code_examples/crawler_custom_parser/selectolax_adaptive_run.py b/docs/guides/code_examples/http_crawlers/selectolax_adaptive_run.py similarity index 94% rename from docs/guides/code_examples/crawler_custom_parser/selectolax_adaptive_run.py rename to docs/guides/code_examples/http_crawlers/selectolax_adaptive_run.py index 8bcd919602..1419a51fed 100644 --- a/docs/guides/code_examples/crawler_custom_parser/selectolax_adaptive_run.py +++ b/docs/guides/code_examples/http_crawlers/selectolax_adaptive_run.py @@ -11,7 +11,7 @@ async def main() -> None: - crawler = AdaptivePlaywrightCrawler( + crawler: AdaptivePlaywrightCrawler = AdaptivePlaywrightCrawler( max_requests_per_crawl=10, # Use custom Selectolax parser for static content parsing. static_parser=SelectolaxLexborParser(), diff --git a/docs/guides/code_examples/crawler_custom_parser/selectolax_context.py b/docs/guides/code_examples/http_crawlers/selectolax_context.py similarity index 89% rename from docs/guides/code_examples/crawler_custom_parser/selectolax_context.py rename to docs/guides/code_examples/http_crawlers/selectolax_context.py index 24c7bc7755..8fe36c039c 100644 --- a/docs/guides/code_examples/crawler_custom_parser/selectolax_context.py +++ b/docs/guides/code_examples/http_crawlers/selectolax_context.py @@ -14,8 +14,6 @@ class SelectolaxLexborContext(ParsedHttpCrawlingContext[LexborHTMLParser]): context methods (push_data, enqueue_links, etc.) plus custom helpers. """ - # It is only for convenience and not strictly necessary, as the - # parsed_content field is already available from the base class. @property def parser(self) -> LexborHTMLParser: """Convenient alias for accessing the parsed document.""" diff --git a/docs/guides/code_examples/crawler_custom_parser/selectolax_crawler.py b/docs/guides/code_examples/http_crawlers/selectolax_crawler.py similarity index 100% rename from docs/guides/code_examples/crawler_custom_parser/selectolax_crawler.py rename to docs/guides/code_examples/http_crawlers/selectolax_crawler.py diff --git a/docs/guides/code_examples/crawler_custom_parser/selectolax_crawler_run.py b/docs/guides/code_examples/http_crawlers/selectolax_crawler_run.py similarity index 100% rename from docs/guides/code_examples/crawler_custom_parser/selectolax_crawler_run.py rename to docs/guides/code_examples/http_crawlers/selectolax_crawler_run.py diff --git a/docs/guides/code_examples/crawler_custom_parser/selectolax_parser.py b/docs/guides/code_examples/http_crawlers/selectolax_parser.py similarity index 100% rename from docs/guides/code_examples/crawler_custom_parser/selectolax_parser.py rename to docs/guides/code_examples/http_crawlers/selectolax_parser.py diff --git a/docs/guides/crawler_custom_parser.mdx b/docs/guides/crawler_custom_parser.mdx deleted file mode 100644 index 0995a7816c..0000000000 --- a/docs/guides/crawler_custom_parser.mdx +++ /dev/null @@ -1,116 +0,0 @@ ---- -id: crawler-with-custom-parser -title: Crawler with custom parser -description: Learn how to use HttpCrawler with third-party parsing libraries and how to create a custom crawler with full framework integration. 
---- - -import ApiLink from '@site/src/components/ApiLink'; -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import CodeBlock from '@theme/CodeBlock'; -import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; - -import LxmlParser from '!!raw-loader!roa-loader!./code_examples/crawler_custom_parser/lxml_parser.py'; -import LxmlSaxoncheParser from '!!raw-loader!roa-loader!./code_examples/crawler_custom_parser/lxml_saxonche_parser.py'; -import LexborParser from '!!raw-loader!roa-loader!./code_examples/crawler_custom_parser/lexbor_parser.py'; -import PyqueryParser from '!!raw-loader!roa-loader!./code_examples/crawler_custom_parser/pyquery_parser.py'; -import ScraplingParser from '!!raw-loader!roa-loader!./code_examples/crawler_custom_parser/scrapling_parser.py'; - -import SelectolaxParserSource from '!!raw-loader!./code_examples/crawler_custom_parser/selectolax_parser.py'; -import SelectolaxContextSource from '!!raw-loader!./code_examples/crawler_custom_parser/selectolax_context.py'; -import SelectolaxCrawlerSource from '!!raw-loader!./code_examples/crawler_custom_parser/selectolax_crawler.py'; -import SelectolaxCrawlerRunSource from '!!raw-loader!./code_examples/crawler_custom_parser/selectolax_crawler_run.py'; -import AdaptiveCrawlerRunSource from '!!raw-loader!./code_examples/crawler_custom_parser/selectolax_adaptive_run.py'; - -Crawlee provides `BeautifulSoupCrawler` and `ParselCrawler` as built-in solutions for HTML parsing. However, you may want to use a different parsing library that better fits your specific needs. - -There are two approaches to integrate a custom parser: - -- **Using `HttpCrawler`** — Parse raw responses directly in request handlers. Quick to set up, but helpers like `enqueue_links` are not available. Best for simple scraping tasks or quick prototyping. -- **Creating a custom crawler** — Implement a crawler based on `AbstractHttpCrawler` for full framework integration. Best for reusable crawlers or when you need full integration and support in `AdaptivePlaywrightCrawler`. - -## Using HttpCrawler with custom parser - -The `HttpCrawler` gives you direct access to raw HTTP responses, allowing you to integrate any parsing library of your choice. When using this approach, helpers like `enqueue_links` and `extract_links` are not available, and it requires minimal setup. - -The following examples demonstrate integration with various parsing libraries: [lxml](https://lxml.de/) for high-performance XPath 1.0 parsing, [lxml with SaxonC-HE](https://pypi.org/project/saxonche/) for XPath 3.1 support, [selectolax](https://github.com/rushter/selectolax) for fast CSS selector-based parsing, [PyQuery](https://pyquery.readthedocs.io/) for jQuery-like syntax, and [scrapling](https://github.com/D4Vinci/Scrapling) for CSS and XPath selectors with Scrapy/Parsel-like API and BeautifulSoup-style find methods. - - - - - {LxmlParser} - - - - - {LxmlSaxoncheParser} - - - - - {LexborParser} - - - - - {PyqueryParser} - - - - - {ScraplingParser} - - - - -## Creating a custom crawler - -For deeper integration with full access to framework helpers like `enqueue_links`, you can create a custom crawler based on `AbstractHttpCrawler`. This approach requires implementing three components — a parser, a crawling context, and the crawler class — but provides a seamless experience similar to built-in crawlers. - -The following example demonstrates how to create a custom crawler using `selectolax` with the `Lexbor` engine. 
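For orientation before the step-by-step sections, the sketch below assembles what such a parser roughly looks like. It is pieced together from the fragments of `selectolax_parser.py` quoted later in this series; the `parse` body, the import layout, and the omission of the remaining abstract hooks are assumptions, not the exact file contents.

```python
from __future__ import annotations

from typing import TYPE_CHECKING

from selectolax.lexbor import LexborHTMLParser, LexborNode
from typing_extensions import override

from crawlee.crawlers._abstract_http import AbstractHttpParser

if TYPE_CHECKING:
    from collections.abc import Sequence

    from crawlee.http_clients import HttpResponse


class SelectolaxLexborParser(AbstractHttpParser[LexborHTMLParser, LexborNode]):
    """HTTP parser backed by selectolax's Lexbor engine (partial sketch)."""

    @override
    async def parse(self, response: HttpResponse) -> LexborHTMLParser:
        # Feed the raw response body to the Lexbor engine.
        return LexborHTMLParser(await response.read())

    @override
    async def select(
        self, parsed_content: LexborHTMLParser, selector: str
    ) -> Sequence[LexborNode]:
        """Select elements matching a CSS selector."""
        return tuple(item for item in parsed_content.css(selector))

    @override
    def is_matching_selector(
        self, parsed_content: LexborHTMLParser, selector: str
    ) -> bool:
        # True when at least one node matches the selector.
        return parsed_content.css_first(selector) is not None

    # The real file also implements the remaining AbstractHttpParser hooks
    # (e.g. text parsing and link extraction); they are omitted here because
    # those hunks are not quoted in this series.
```

Returning a tuple from `select` satisfies the `Sequence` return type without exposing a mutable list.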
- -### Implementing the parser - -The parser converts HTTP responses into a parsed document and provides methods for element selection. Implement `AbstractHttpParser` using `selectolax` with required methods for parsing and querying: - - - {SelectolaxParserSource} - - -### Defining the crawling context - -The crawling context is passed to request handlers and provides access to the parsed content. Extend `ParsedHttpCrawlingContext` to define the interface your handlers will work with: - - - {SelectolaxContextSource} - - -### Building the crawler - -The crawler class connects the parser and context. Extend `AbstractHttpCrawler` and configure the context pipeline to use your custom components: - - - {SelectolaxCrawlerSource} - - -### Using the crawler - -The custom crawler works like any built-in crawler. Request handlers receive your custom context with full access to framework helpers like `enqueue_links`. Additionally, the custom parser can be used with `AdaptivePlaywrightCrawler` for adaptive crawling: - - - - - {SelectolaxCrawlerRunSource} - - - - - {AdaptiveCrawlerRunSource} - - - - - -## Conclusion - -Crawlee offers flexible options for integrating custom parsing libraries. Use `HttpCrawler` for quick integration when you need to parse responses with your preferred library. For full framework integration with helpers like `enqueue_links`, implement a custom crawler using `AbstractHttpCrawler`. Both approaches allow you to leverage any parser from the Python ecosystem while benefiting from Crawlee's request management, rate limiting, and data storage features. diff --git a/docs/guides/http_crawlers.mdx b/docs/guides/http_crawlers.mdx index 3cd29ed314..47c14a4a58 100644 --- a/docs/guides/http_crawlers.mdx +++ b/docs/guides/http_crawlers.mdx @@ -8,11 +8,24 @@ import ApiLink from '@site/src/components/ApiLink'; import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; +import CodeBlock from '@theme/CodeBlock'; import BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/http_crawlers/beautifulsoup_example.py'; import ParselExample from '!!raw-loader!roa-loader!./code_examples/http_crawlers/parsel_example.py'; import HttpExample from '!!raw-loader!roa-loader!./code_examples/http_crawlers/http_example.py'; +import LxmlParser from '!!raw-loader!roa-loader!./code_examples/http_crawlers/lxml_parser.py'; +import LxmlSaxoncheParser from '!!raw-loader!roa-loader!./code_examples/http_crawlers/lxml_saxonche_parser.py'; +import LexborParser from '!!raw-loader!roa-loader!./code_examples/http_crawlers/lexbor_parser.py'; +import PyqueryParser from '!!raw-loader!roa-loader!./code_examples/http_crawlers/pyquery_parser.py'; +import ScraplingParser from '!!raw-loader!roa-loader!./code_examples/http_crawlers/scrapling_parser.py'; + +import SelectolaxParserSource from '!!raw-loader!./code_examples/http_crawlers/selectolax_parser.py'; +import SelectolaxContextSource from '!!raw-loader!./code_examples/http_crawlers/selectolax_context.py'; +import SelectolaxCrawlerSource from '!!raw-loader!./code_examples/http_crawlers/selectolax_crawler.py'; +import SelectolaxCrawlerRunSource from '!!raw-loader!./code_examples/http_crawlers/selectolax_crawler_run.py'; +import AdaptiveCrawlerRunSource from '!!raw-loader!./code_examples/http_crawlers/selectolax_adaptive_run.py'; + HTTP crawlers are ideal for extracting data from server-rendered websites that don't require JavaScript execution. 
These crawlers make requests via HTTP clients to fetch HTML content and then parse it using various parsing libraries. For client-side rendered content, where you need to execute JavaScript, consider using [Playwright crawler](https://crawlee.dev/python/docs/guides/playwright-crawler) instead. ## Overview @@ -84,7 +97,41 @@ The `HttpCrawler` provides direct acce {HttpExample} -## Creating custom HTTP crawler +## Using HttpCrawler with a custom parser + +Since `HttpCrawler` provides raw HTTP responses, you can integrate any parsing library. Note that helpers like `enqueue_links` and `extract_links` are not available with this approach. + +The following examples demonstrate integration with various parsing libraries: [lxml](https://lxml.de/) for high-performance XPath 1.0 parsing, [lxml with SaxonC-HE](https://pypi.org/project/saxonche/) for XPath 3.1 support, [selectolax](https://github.com/rushter/selectolax) for fast CSS selector-based parsing, [PyQuery](https://pyquery.readthedocs.io/) for jQuery-like syntax, and [scrapling](https://github.com/D4Vinci/Scrapling) for CSS and XPath selectors with Scrapy/Parsel-like API and BeautifulSoup-style find methods. + + + + + {LxmlParser} + + + + + {LxmlSaxoncheParser} + + + + + {LexborParser} + + + + + {PyqueryParser} + + + + + {ScraplingParser} + + + + +## Creating a custom HTTP crawler While the built-in crawlers cover most use cases, you might need a custom HTTP crawler for specialized parsing requirements. To create a custom HTTP crawler, inherit directly from `AbstractHttpCrawler`. This approach requires implementing: @@ -94,8 +141,53 @@ While the built-in crawlers cover most use cases, you might need a custom HTTP c This approach is recommended when you need tight integration between parsing and the crawling context, or when you're building a reusable crawler for a specific technology or format. +The following example demonstrates how to create a custom crawler using `selectolax` with the `Lexbor` engine. + +### Implementing the parser + +The parser converts HTTP responses into a parsed document and provides methods for element selection. Implement `AbstractHttpParser` using `selectolax` with required methods for parsing and querying: + + + {SelectolaxParserSource} + + +This is enough to use your parser with the `AbstractHttpCrawler.create_parsed_http_crawler_class` factory method. For more control, continue with custom context and crawler classes below. + +### Defining the crawling context + +The crawling context is passed to request handlers and provides access to the parsed content. Extend `ParsedHttpCrawlingContext` to define the interface your handlers will work with. Here you can implement additional helpers for the crawler context. + + + {SelectolaxContextSource} + + +### Building the crawler + +The crawler class connects the parser and context. Extend `AbstractHttpCrawler` and configure the context pipeline to use your custom components: + + + {SelectolaxCrawlerSource} + + +### Using the crawler + +The custom crawler works like any built-in crawler. Request handlers receive your custom context with full access to framework helpers like `enqueue_links`. Additionally, the custom parser can be used with `AdaptivePlaywrightCrawler` for adaptive crawling: + + + + + {SelectolaxCrawlerRunSource} + + + + + {AdaptiveCrawlerRunSource} + + + + ## Conclusion -This guide provided a comprehensive overview of HTTP crawlers in Crawlee.
You learned about the three main crawler types - `BeautifulSoupCrawler` for fault-tolerant HTML parsing, `ParselCrawler` for high-performance extraction with XPath and CSS selectors, and `HttpCrawler` for raw response processing. You also discovered how to create custom crawlers for specific use cases. +This guide provided a comprehensive overview of HTTP crawlers in Crawlee. You learned about the three main crawler types - `BeautifulSoupCrawler` for fault-tolerant HTML parsing, `ParselCrawler` for high-performance extraction with XPath and CSS selectors, and `HttpCrawler` for raw response processing. You also discovered how to integrate third-party parsing libraries with `HttpCrawler` and how to create fully custom crawlers using `AbstractHttpCrawler` for specialized parsing requirements. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! From 2bc596745286bf291d531d60121d8fd608c1e282 Mon Sep 17 00:00:00 2001 From: Max Bohomolov <34358312+Mantisus@users.noreply.github.com> Date: Sat, 20 Dec 2025 16:20:00 +0200 Subject: [PATCH 08/18] Update pyproject.toml Co-authored-by: Vlada Dusek --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 19795b833a..d6483165bc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -257,7 +257,7 @@ module = [ "camoufox", # Example code shows integration of camoufox and crawlee. "fastapi", # Example code shows running in webserver. "saxonche", # Example code shows HttpCrawler with custom parser. - "scrapling.*", # Example code shows HttpCrawler with custom parser. + "scrapling.*", # Example code shows HttpCrawler with custom parser. "selectolax.*", # Example code shows HttpCrawler with custom parser. "stagehand.*", # Example code shows integration of Stagehand and crawlee. "starlette.*", # Example code shows running in webserver. From 2b1f41fb4f738d3a8f07b774d28485bd0e9df3b7 Mon Sep 17 00:00:00 2001 From: Max Bohomolov <34358312+Mantisus@users.noreply.github.com> Date: Sat, 20 Dec 2025 16:20:10 +0200 Subject: [PATCH 09/18] Update docs/guides/http_crawlers.mdx Co-authored-by: Vlada Dusek --- docs/guides/http_crawlers.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/guides/http_crawlers.mdx b/docs/guides/http_crawlers.mdx index 47c14a4a58..956edc2907 100644 --- a/docs/guides/http_crawlers.mdx +++ b/docs/guides/http_crawlers.mdx @@ -169,7 +169,7 @@ The crawler class connects the parser and context. Extend `enqueue_links`. Additionally, the custom parser can be used with `AdaptivePlaywrightCrawler` for adaptive crawling: From 08ee00c667427ce379e78ae05832620353a13860 Mon Sep 17 00:00:00 2001 From: Max Bohomolov <34358312+Mantisus@users.noreply.github.com> Date: Sat, 20 Dec 2025 16:20:29 +0200 Subject: [PATCH 10/18] Update docs/guides/http_crawlers.mdx Co-authored-by: Vlada Dusek --- docs/guides/http_crawlers.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/guides/http_crawlers.mdx b/docs/guides/http_crawlers.mdx index 956edc2907..2eefbffb2d 100644 --- a/docs/guides/http_crawlers.mdx +++ b/docs/guides/http_crawlers.mdx @@ -161,7 +161,7 @@ The crawling context is passed to request handlers and provides access to the pa {SelectolaxContextSource} -### Building the crawler +### Crawler composition The crawler class connects the parser and context. 
Extend `AbstractHttpCrawler` and configure the context pipeline to use your custom components: From 8195397b04a147838e93b1ca812b6ea1f4d2604a Mon Sep 17 00:00:00 2001 From: Max Bohomolov <34358312+Mantisus@users.noreply.github.com> Date: Sat, 20 Dec 2025 16:20:37 +0200 Subject: [PATCH 11/18] Update docs/guides/http_crawlers.mdx Co-authored-by: Vlada Dusek --- docs/guides/http_crawlers.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/guides/http_crawlers.mdx b/docs/guides/http_crawlers.mdx index 2eefbffb2d..31644135f7 100644 --- a/docs/guides/http_crawlers.mdx +++ b/docs/guides/http_crawlers.mdx @@ -153,7 +153,7 @@ The parser converts HTTP responses into a parsed document and provides methods f This is enough to use your parser with the `AbstractHttpCrawler.create_parsed_http_crawler_class` factory method. For more control, continue with custom context and crawler classes below. -### Defining the crawling context +### Crawling context definition The crawling context is passed to request handlers and provides access to the parsed content. Extend `ParsedHttpCrawlingContext` to define the interface your handlers will work with. Here you can implement additional helpers for the crawler context. From a5be06d1cb43983c7365409ba3401443159bebc6 Mon Sep 17 00:00:00 2001 From: Max Bohomolov <34358312+Mantisus@users.noreply.github.com> Date: Sat, 20 Dec 2025 16:20:43 +0200 Subject: [PATCH 12/18] Update docs/guides/http_crawlers.mdx Co-authored-by: Vlada Dusek --- docs/guides/http_crawlers.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/guides/http_crawlers.mdx b/docs/guides/http_crawlers.mdx index 31644135f7..f72f27d11a 100644 --- a/docs/guides/http_crawlers.mdx +++ b/docs/guides/http_crawlers.mdx @@ -143,7 +143,7 @@ This approach is recommended when you need tight integration between parsing and The following example demonstrates how to create a custom crawler using `selectolax` with the `Lexbor` engine. -### Implementing the parser +### Parser implementation The parser converts HTTP responses into a parsed document and provides methods for element selection. Implement `AbstractHttpParser` using `selectolax` with required methods for parsing and querying: From e22346bfdaabe8e1187f09a7e4e60db83fd4bafc Mon Sep 17 00:00:00 2001 From: Max Bohomolov <34358312+Mantisus@users.noreply.github.com> Date: Sat, 20 Dec 2025 16:20:53 +0200 Subject: [PATCH 13/18] Update docs/guides/http_crawlers.mdx Co-authored-by: Vlada Dusek --- docs/guides/http_crawlers.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/guides/http_crawlers.mdx b/docs/guides/http_crawlers.mdx index f72f27d11a..a026bbbb0e 100644 --- a/docs/guides/http_crawlers.mdx +++ b/docs/guides/http_crawlers.mdx @@ -131,7 +131,7 @@ The following examples demonstrate integration with various parsing libraries: [ -## Creating a custom HTTP crawler +## Custom HTTP crawler While the built-in crawlers cover most use cases, you might need a custom HTTP crawler for specialized parsing requirements. To create a custom HTTP crawler, inherit directly from `AbstractHttpCrawler`.
This approach requires implementing: From 3cfacf0db9d3819d9ba3a0c1f70a01dfd17e1836 Mon Sep 17 00:00:00 2001 From: Max Bohomolov <34358312+Mantisus@users.noreply.github.com> Date: Sat, 20 Dec 2025 16:20:59 +0200 Subject: [PATCH 14/18] Update docs/guides/http_crawlers.mdx Co-authored-by: Vlada Dusek --- docs/guides/http_crawlers.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/guides/http_crawlers.mdx b/docs/guides/http_crawlers.mdx index a026bbbb0e..f318e6234d 100644 --- a/docs/guides/http_crawlers.mdx +++ b/docs/guides/http_crawlers.mdx @@ -97,7 +97,7 @@ The `HttpCrawler` provides direct acce {HttpExample} -## Using HttpCrawler with a custom parser +### Using custom parsers Since `HttpCrawler` provides raw HTTP responses, you can integrate any parsing library. Note that helpers like `enqueue_links` and `extract_links` are not available with this approach. From ac427cc0d8adc46261f9a10572c4957c59cb4fb0 Mon Sep 17 00:00:00 2001 From: Max Bohomolov <34358312+Mantisus@users.noreply.github.com> Date: Sat, 20 Dec 2025 16:21:04 +0200 Subject: [PATCH 15/18] Update docs/guides/http_crawlers.mdx Co-authored-by: Vlada Dusek --- docs/guides/http_crawlers.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/guides/http_crawlers.mdx b/docs/guides/http_crawlers.mdx index f318e6234d..73c36b9c9c 100644 --- a/docs/guides/http_crawlers.mdx +++ b/docs/guides/http_crawlers.mdx @@ -101,7 +101,7 @@ The `HttpCrawler` provides direct acce Since `HttpCrawler` provides raw HTTP responses, you can integrate any parsing library. Note that helpers like `enqueue_links` and `extract_links` are not available with this approach. -The following examples demonstrate integration with various parsing libraries: [lxml](https://lxml.de/) for high-performance XPath 1.0 parsing, [lxml with SaxonC-HE](https://pypi.org/project/saxonche/) for XPath 3.1 support, [selectolax](https://github.com/rushter/selectolax) for fast CSS selector-based parsing, [PyQuery](https://pyquery.readthedocs.io/) for jQuery-like syntax, and [scrapling](https://github.com/D4Vinci/Scrapling) for CSS and XPath selectors with Scrapy/Parsel-like API and BeautifulSoup-style find methods. +The following examples demonstrate how to integrate with several popular parsing libraries, including [lxml](https://lxml.de/) (high-performance parsing with XPath 1.0), [lxml with SaxonC-HE](https://pypi.org/project/saxonche/) (XPath 3.1 support), [selectolax](https://github.com/rushter/selectolax) (high-speed CSS selectors), [PyQuery](https://pyquery.readthedocs.io/) (jQuery-like syntax), and [scrapling](https://github.com/D4Vinci/Scrapling) (a Scrapy/Parsel-style API offering BeautifulSoup-like methods). 
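To make the shared pattern concrete, here is a minimal sketch of one such integration, using selectolax's Lexbor parser inside a plain `HttpCrawler` handler. It is not one of the linked example files; the handler body and start URL are illustrative only.

```python
import asyncio

from selectolax.lexbor import LexborHTMLParser

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext


async def main() -> None:
    crawler = HttpCrawler(max_requests_per_crawl=10)

    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        # Parse the raw response body with the Lexbor engine.
        tree = LexborHTMLParser(await context.http_response.read())
        title = tree.css_first('title')
        await context.push_data({
            'url': context.request.url,
            'title': title.text() if title else None,
        })

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```

Because no parser-aware context is involved, link extraction and request enqueuing have to be done by hand in this variant, as noted above.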
From 24ca257e4c90a9d24beb07e362bcb63426a7d8fd Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Tue, 23 Dec 2025 13:21:02 +0000 Subject: [PATCH 16/18] replace match to item --- docs/guides/code_examples/http_crawlers/selectolax_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/guides/code_examples/http_crawlers/selectolax_parser.py b/docs/guides/code_examples/http_crawlers/selectolax_parser.py index 2e4f5f0f6f..1627a3b220 100644 --- a/docs/guides/code_examples/http_crawlers/selectolax_parser.py +++ b/docs/guides/code_examples/http_crawlers/selectolax_parser.py @@ -34,7 +34,7 @@ async def select( self, parsed_content: LexborHTMLParser, selector: str ) -> Sequence[LexborNode]: """Select elements matching a CSS selector.""" - return tuple(match for match in parsed_content.css(selector)) + return tuple(item for item in parsed_content.css(selector)) @override def is_matching_selector( From e4a20b0fd2fae8d0226bd0d1505994606d9fc3fd Mon Sep 17 00:00:00 2001 From: Max Bohomolov <34358312+Mantisus@users.noreply.github.com> Date: Mon, 5 Jan 2026 21:21:25 +0200 Subject: [PATCH 17/18] Update docs/guides/http_crawlers.mdx Co-authored-by: Jan Buchar --- docs/guides/http_crawlers.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/guides/http_crawlers.mdx b/docs/guides/http_crawlers.mdx index 73c36b9c9c..366b36127c 100644 --- a/docs/guides/http_crawlers.mdx +++ b/docs/guides/http_crawlers.mdx @@ -153,7 +153,7 @@ The parser converts HTTP responses into a parsed document and provides methods f This is enough to use your parser with the `AbstractHttpCrawler.create_parsed_http_crawler_class` factory method. For more control, continue with custom context and crawler classes below. -### Crawling context definition +### Crawling context definition (optional) The crawling context is passed to request handlers and provides access to the parsed content. Extend `ParsedHttpCrawlingContext` to define the interface your handlers will work with. Here you can implement additional helpers for the crawler context. From a4ccbdec351c90360dd7553c96d89956da703d84 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Mon, 5 Jan 2026 22:55:13 +0000 Subject: [PATCH 18/18] add additional comments --- .../guides/code_examples/http_crawlers/selectolax_context.py | 2 ++ .../guides/code_examples/http_crawlers/selectolax_crawler.py | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/docs/guides/code_examples/http_crawlers/selectolax_context.py b/docs/guides/code_examples/http_crawlers/selectolax_context.py index 8fe36c039c..3a34e20d8d 100644 --- a/docs/guides/code_examples/http_crawlers/selectolax_context.py +++ b/docs/guides/code_examples/http_crawlers/selectolax_context.py @@ -6,6 +6,8 @@ from crawlee.crawlers._abstract_http import ParsedHttpCrawlingContext +# Custom context for the Selectolax parser. You can add your own methods here +# to facilitate working with the parsed document. @dataclass(frozen=True) class SelectolaxLexborContext(ParsedHttpCrawlingContext[LexborHTMLParser]): """Crawling context providing access to the parsed page.
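Assembled from the hunks quoted across this series (the `parser` property and its docstring appear in an earlier diff), the context class amounts to roughly the following. Treat it as a sketch: only fragments of the file are shown above, and the `from_parsed_http_crawling_context` constructor used in the next diff is not among them.

```python
from __future__ import annotations

from dataclasses import dataclass

from selectolax.lexbor import LexborHTMLParser

from crawlee.crawlers._abstract_http import ParsedHttpCrawlingContext


# Custom context for the Selectolax parser. You can add your own methods here
# to facilitate working with the parsed document.
@dataclass(frozen=True)
class SelectolaxLexborContext(ParsedHttpCrawlingContext[LexborHTMLParser]):
    """Crawling context providing access to the parsed page."""

    @property
    def parser(self) -> LexborHTMLParser:
        """Convenient alias for accessing the parsed document."""
        # The parsed_content field is already available from the base class.
        return self.parsed_content
```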
diff --git a/docs/guides/code_examples/http_crawlers/selectolax_crawler.py b/docs/guides/code_examples/http_crawlers/selectolax_crawler.py index d5efc466e6..677a6a3b00 100644 --- a/docs/guides/code_examples/http_crawlers/selectolax_crawler.py +++ b/docs/guides/code_examples/http_crawlers/selectolax_crawler.py @@ -17,6 +17,9 @@ from crawlee.crawlers._abstract_http import ParsedHttpCrawlingContext +# Custom crawler using the custom context. It is optional; you can use +# AbstractHttpCrawler directly with SelectolaxLexborParser if you don't need +# any custom context methods. class SelectolaxLexborCrawler( AbstractHttpCrawler[SelectolaxLexborContext, LexborHTMLParser, LexborNode] ): @@ -30,6 +33,8 @@ def __init__( async def final_step( context: ParsedHttpCrawlingContext[LexborHTMLParser], ) -> AsyncGenerator[SelectolaxLexborContext, None]: + # Yield the custom context, which wraps the base context with additional + # functionality. yield SelectolaxLexborContext.from_parsed_http_crawling_context(context) # Build context pipeline: HTTP request -> parsing -> custom context.
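With the parser, context, and crawler in place, an end-to-end run follows the "Using the crawler" section added earlier in this series. The sketch below is hypothetical: the module paths stand in for the guide's example files, and the constructor is assumed to forward standard crawler options such as `max_requests_per_crawl`.

```python
import asyncio

# Illustrative imports; in the guide these classes live in the
# selectolax_* example files referenced by the MDX page.
from selectolax_context import SelectolaxLexborContext
from selectolax_crawler import SelectolaxLexborCrawler


async def main() -> None:
    crawler = SelectolaxLexborCrawler(max_requests_per_crawl=10)

    @crawler.router.default_handler
    async def request_handler(context: SelectolaxLexborContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')
        # Query the document through the custom `parser` alias.
        title = context.parser.css_first('title')
        await context.push_data({
            'url': context.request.url,
            'title': title.text() if title else None,
        })
        # Framework helpers are available because the crawler
        # builds on AbstractHttpCrawler.
        await context.enqueue_links()

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```

Unlike the plain `HttpCrawler` variant shown earlier, `enqueue_links` works here because the context pipeline supplies a parser-aware crawling context.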