@@ -1,4 +1,5 @@
 import asyncio
+import csv
 
 from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext

@@ -31,7 +32,7 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
 
     # Export the entire dataset to a CSV file.
    # Use semicolon as delimiter and always quote strings.
-    await crawler.export_data(path='results.csv', delimiter=';', quoting='all')
+    await crawler.export_data(path='results.csv', delimiter=';', quoting=csv.QUOTE_ALL)
 
 
 if __name__ == '__main__':
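Note on the hunk above: `quoting` expects one of the integer `QUOTE_*` constants from the standard-library `csv` module, not a string, which is why the bare `'all'` had to go. For reference, the four classic constants are plain ints, which is also what makes the `Literal[0, 1, 2, 3]` annotation in the next file work:

import csv

assert csv.QUOTE_MINIMAL == 0     # quote only when required (the default)
assert csv.QUOTE_ALL == 1         # quote every field
assert csv.QUOTE_NONNUMERIC == 2  # quote all non-numeric fields
assert csv.QUOTE_NONE == 3        # never quote; requires an escapechar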
11 changes: 10 additions & 1 deletion src/crawlee/_types.py
@@ -813,7 +813,7 @@ class ExportDataCsvKwargs(TypedDict):
     """A one-character string used to quote fields containing special characters, like the delimiter or quotechar,
     or fields containing new-line characters. Defaults to '\"'."""
 
-    quoting: NotRequired[int]
+    quoting: NotRequired[Literal[0, 1, 2, 3]]
     """Controls when quotes should be generated by the writer and recognized by the reader. Can take any of
     the `QUOTE_*` constants, with a default of `QUOTE_MINIMAL`."""

@@ -822,3 +822,12 @@
 
     strict: NotRequired[bool]
     """When True, raises an exception on bad CSV input. Defaults to False."""
+
+
+class ExportDataKwargs(ExportDataJsonKwargs, ExportDataCsvKwargs):
+    """Keyword arguments accepted by `BasicCrawler.export_data`.
+
+    Combines all `ExportDataJsonKwargs` and `ExportDataCsvKwargs` fields, since the export format is
+    determined dynamically from the file extension at call time. Only the kwargs relevant to the selected
+    format are forwarded to the underlying exporter.
+    """
@@ -352,7 +352,7 @@ async def from_static_pipeline_to_top_router(
             )
             await self.router(adaptive_crawling_context)
 
-        return self._static_context_pipeline(context_linked_to_result, from_static_pipeline_to_top_router)  # ty: ignore[invalid-argument-type]
+        return self._static_context_pipeline(context_linked_to_result, from_static_pipeline_to_top_router)
 
     if rendering_type == 'client only':

@@ -362,7 +362,7 @@ async def from_pw_pipeline_to_top_router(context: PlaywrightCrawlingContext) ->
             )
             await self.router(adaptive_crawling_context)
 
-        return self._pw_context_pipeline(context_linked_to_result, from_pw_pipeline_to_top_router)  # ty: ignore[invalid-argument-type]
+        return self._pw_context_pipeline(context_linked_to_result, from_pw_pipeline_to_top_router)
 
     raise RuntimeError(
         f'Not a valid rendering type. Must be one of the following: {", ".join(get_args(RenderingType))}'
3 changes: 2 additions & 1 deletion src/crawlee/crawlers/_basic/_basic_crawler.py
@@ -35,6 +35,7 @@
     EnqueueLinksKwargs,
     ExportDataCsvKwargs,
     ExportDataJsonKwargs,
+    ExportDataKwargs,
     GetKeyValueStoreFromRequestHandlerFunction,
     HttpHeaders,
     HttpPayload,

@@ -896,7 +897,7 @@ async def export_data(
         dataset_id: str | None = None,
         dataset_name: str | None = None,
         dataset_alias: str | None = None,
-        **additional_kwargs: Unpack[ExportDataJsonKwargs | ExportDataCsvKwargs],
+        **additional_kwargs: Unpack[ExportDataKwargs],
     ) -> None:
         """Export all items from a Dataset to a JSON or CSV file.
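Worth spelling out why the merged class was needed: `Unpack[...]` must be given a single `TypedDict`, so the union `ExportDataJsonKwargs | ExportDataCsvKwargs` was not a valid annotation for `**kwargs`. With `ExportDataKwargs`, calls like the following type-check (a sketch against the signature above; `delimiter` and `quoting` come from `ExportDataCsvKwargs`, and `indent` is assumed to be among the `ExportDataJsonKwargs` fields):

# CSV export: csv.writer options are forwarded.
await crawler.export_data(path='results.csv', delimiter=';', quoting=csv.QUOTE_ALL)

# JSON export: json.dump options are forwarded instead.
await crawler.export_data(path='results.json', indent=2)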
8 changes: 4 additions & 4 deletions src/crawlee/crawlers/_playwright/_playwright_crawler.py
@@ -5,7 +5,7 @@
 import warnings
 from datetime import timedelta
 from functools import partial
-from typing import TYPE_CHECKING, Any, Generic, Literal
+from typing import TYPE_CHECKING, Any, Generic, Literal, cast
 
 import playwright.async_api
 from more_itertools import partition

@@ -32,7 +32,7 @@
 from ._playwright_http_client import PlaywrightHttpClient, browser_page_context
 from ._playwright_post_nav_crawling_context import PlaywrightPostNavCrawlingContext
 from ._playwright_pre_nav_crawling_context import PlaywrightPreNavCrawlingContext
-from ._types import GotoOptions
+from ._types import BlockRequestsFunction, GotoOptions
 from ._utils import block_requests, infinite_scroll
 
 TCrawlingContext = TypeVar('TCrawlingContext', bound=PlaywrightCrawlingContext)

@@ -251,7 +251,7 @@ async def _open_page(
             log=context.log,
             register_deferred_cleanup=context.register_deferred_cleanup,
             page=crawlee_page.page,
-            block_requests=partial(block_requests, page=crawlee_page.page),
+            block_requests=cast('BlockRequestsFunction', partial(block_requests, page=crawlee_page.page)),
             goto_options=GotoOptions(**self._goto_options),
         )

@@ -535,7 +535,7 @@ async def _create_crawling_context(
             infinite_scroll=lambda: infinite_scroll(context.page),
             extract_links=extract_links,
             enqueue_links=self._create_enqueue_links_function(context, extract_links),
-            block_requests=partial(block_requests, page=context.page),
+            block_requests=cast('BlockRequestsFunction', partial(block_requests, page=context.page)),
         )
 
         if context.session:
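The `cast` is what lets the two `ty: ignore[invalid-argument-type]` comments in the adaptive crawler be dropped: `functools.partial` produces a `partial[T]` object, and some checkers (here `ty`) will not accept it where a precise callback type such as `BlockRequestsFunction` is declared, even though the bound call is compatible at runtime. A standalone sketch of the pattern (generic names, not the library's):

from functools import partial
from typing import Protocol, cast

class Greeter(Protocol):
    def __call__(self, name: str) -> str: ...

def greet(name: str, *, greeting: str) -> str:
    return f'{greeting}, {name}!'

# Some checkers cannot prove that partial[str] matches the protocol;
# the cast asserts the compatibility that holds at runtime.
hello = cast('Greeter', partial(greet, greeting='Hello'))

print(hello('Ada'))  # -> Hello, Ada!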
@@ -27,9 +27,7 @@
 from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawler_statistics import (
     AdaptivePlaywrightCrawlerStatisticState,
 )
-from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawling_context import (
-    AdaptiveContextError,
-)
+from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawling_context import AdaptiveContextError
 from crawlee.sessions import SessionPool
 from crawlee.statistics import Statistics
 from crawlee.storage_clients import SqlStorageClient

@@ -87,7 +85,8 @@ def __init__(
     ) -> None:
         super().__init__()
 
-        self._rendering_types = rendering_types or cycle(['static'])
+        default_rendering_types: list[RenderingType] = ['static']
+        self._rendering_types = rendering_types or cycle(default_rendering_types)
         self._detection_probability_recommendation = detection_probability_recommendation or cycle([1])
 
     @override
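Finally, the two-line change in the last hunk is a literal-inference workaround: `cycle(['static'])` is inferred as `cycle[str]`, while the attribute needs elements of the `RenderingType` literal type, so the annotated local pins the element type before `cycle` sees the list. A condensed illustration (simplified; crawlee's actual `RenderingType` alias is assumed to be a `Literal` of rendering modes):

from collections.abc import Iterator
from itertools import cycle
from typing import Literal

RenderingType = Literal['static', 'client only']

# Inferred as cycle[str]: the list elements are widened to str, so this
# would be rejected where Iterator[RenderingType] is expected.
widened = cycle(['static'])

# Annotating the list first keeps the elements typed as RenderingType.
default_rendering_types: list[RenderingType] = ['static']
typed: Iterator[RenderingType] = cycle(default_rendering_types)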