|
26 | 26 | from crawlee.configuration import Configuration |
27 | 27 | from crawlee.crawlers import BasicCrawler |
28 | 28 | from crawlee.errors import RequestCollisionError, SessionError, UserDefinedErrorHandlerError |
29 | | -from crawlee.events import Event, EventCrawlerStatusData |
30 | | -from crawlee.events._local_event_manager import LocalEventManager |
| 29 | +from crawlee.events import Event, EventCrawlerStatusData, LocalEventManager |
31 | 30 | from crawlee.request_loaders import RequestList, RequestManagerTandem |
32 | 31 | from crawlee.sessions import Session, SessionPool |
33 | 32 | from crawlee.statistics import FinalStatistics |
@@ -2047,3 +2046,104 @@ async def error_handler(context: BasicCrawlingContext, error: Exception) -> Requ |
2047 | 2046 | assert error_request is not None |
2048 | 2047 | assert error_request.state == RequestState.DONE |
2049 | 2048 | assert error_request.was_already_handled |
| 2049 | + |
| 2050 | + |
@pytest.mark.skipif(sys.version_info[:3] < (3, 11), reason='asyncio.Barrier was introduced in Python 3.11.')
async def test_multiple_crawlers_with_global_event_manager() -> None:
    """Test that multiple crawlers work correctly when using the global event manager.

    Two crawlers with separate request queues run concurrently. Both request handlers
    must observe the shared (global) event manager as active — even after the first
    crawler has fully finished — and the manager must only become inactive once the
    last crawler completes.
    """

    rq1 = await RequestQueue.open(alias='rq1')
    rq2 = await RequestQueue.open(alias='rq2')

    crawler_1 = BasicCrawler(request_manager=rq1)
    crawler_2 = BasicCrawler(request_manager=rq2)

    # started_event: set once crawler_1's handler is running (event manager activated).
    # finished_event: set once crawler_1's run has fully completed.
    started_event = asyncio.Event()
    finished_event = asyncio.Event()

    async def launch_crawler_1() -> None:
        await crawler_1.run(['https://a.placeholder.com'])
        finished_event.set()

    async def launch_crawler_2() -> None:
        # Ensure that crawler_1 is already running and has activated event_manager
        await started_event.wait()
        await crawler_2.run(['https://b.placeholder.com'])

    handler_barrier = asyncio.Barrier(2)  # ty:ignore[unresolved-attribute] # Test is skipped in older Python versions.

    # Records the event manager's `active` state as seen from inside each handler.
    handler_call = AsyncMock()

    @crawler_1.router.default_handler
    async def handler_1(context: BasicCrawlingContext) -> None:
        started_event.set()
        # Ensure that both handlers are running at the same time.
        await handler_barrier.wait()
        event_manager = service_locator.get_event_manager()

        await handler_call(event_manager.active)

    @crawler_2.router.default_handler
    async def handler_2(context: BasicCrawlingContext) -> None:
        # Ensure that both handlers are running at the same time.
        await handler_barrier.wait()
        # Ensure that crawler_1 is finished and closed all active contexts.
        await finished_event.wait()
        # Check that event manager is active and can be used in the second crawler.
        event_manager = service_locator.get_event_manager()

        await handler_call(event_manager.active)

    await asyncio.gather(
        launch_crawler_1(),
        launch_crawler_2(),
    )

    # Each handler recorded the manager state exactly once.
    assert handler_call.call_count == 2

    first_call = handler_call.call_args_list[0]
    second_call = handler_call.call_args_list[1]

    # Both handlers must have observed the global event manager as active.
    assert first_call[0][0] is True
    assert second_call[0][0] is True

    event_manager = service_locator.get_event_manager()

    # After both crawlers are finished, event manager should be inactive.
    assert event_manager.active is False

    await rq1.drop()
    await rq2.drop()
| 2117 | + |
| 2118 | + |
async def test_global_and_local_event_manager_in_crawler_run() -> None:
    """Test that both global and local event managers are used in crawler run.

    The crawler is constructed with an explicit ``LocalEventManager`` while the
    global one is resolved via ``service_locator``. Both must report ``active``
    while the handler runs and become inactive once the run finishes.
    """

    config = service_locator.get_configuration()

    local_event_manager = LocalEventManager.from_config(config)

    crawler = BasicCrawler(event_manager=local_event_manager)

    # Records the `active` state of both managers as seen from inside the handler.
    handler_call = AsyncMock()

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        global_event_manager = service_locator.get_event_manager()
        # Await the AsyncMock so no un-awaited coroutine (and its RuntimeWarning)
        # is left behind; this matches how handler mocks are invoked elsewhere
        # in this file.
        await handler_call(local_event_manager.active, global_event_manager.active)

    await crawler.run(['https://a.placeholder.com'])

    assert handler_call.call_count == 1

    local_em_state, global_em_state = handler_call.call_args_list[0][0]

    # Both event managers should be active while the handler runs.
    assert local_em_state is True
    assert global_em_state is True

    global_event_manager = service_locator.get_event_manager()

    # After crawler is finished, both event managers should be inactive.
    assert local_event_manager.active is False
    assert global_event_manager.active is False
0 commit comments