Run parallel crawlers

This example demonstrates how to run two crawlers in parallel, where one crawler processes links discovered by the other.

In some situations, you may need different approaches for scraping data from a website. For example, you might use PlaywrightCrawler to navigate JavaScript-heavy pages and a faster, more lightweight ParselCrawler to process static pages. One way to solve this is to use AdaptivePlaywrightCrawler; see the Adaptive Playwright crawler example to learn more.
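
For comparison, here is a minimal sketch of the adaptive approach. It assumes the AdaptivePlaywrightCrawler.with_parsel_static_parser() constructor and the parsed_content attribute on the adaptive crawling context; refer to the Adaptive Playwright crawler example for the complete, up-to-date version.

import asyncio

from crawlee.crawlers import AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext


async def main() -> None:
    # Parsel is used for static pages; Playwright is used only when the
    # crawler detects that JavaScript rendering is needed.
    crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser(
        max_requests_per_crawl=10,
    )

    @crawler.router.default_handler
    async def handler(context: AdaptivePlaywrightCrawlingContext) -> None:
        title = context.parsed_content.css('title::text').get()
        await context.push_data({'title': title, 'url': context.request.url})

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())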

The code below demonstrates an alternative approach using two separate crawlers. Links are passed between crawlers via RequestQueue aliases. The keep_alive option allows the Playwright crawler to run in the background and wait for incoming links without stopping when its queue is empty. You can also use different storage clients for each crawler without losing the ability to pass links between queues. Learn more about available storage clients in this guide.

import asyncio

from crawlee import ConcurrencySettings
from crawlee.crawlers import (
    ParselCrawler,
    ParselCrawlingContext,
    PlaywrightCrawler,
    PlaywrightCrawlingContext,
)
from crawlee.sessions import SessionPool
from crawlee.storages import RequestQueue


async def main() -> None:
    # Open request queues for both crawlers with different aliases
    playwright_rq = await RequestQueue.open(alias='playwright-requests')
    parsel_rq = await RequestQueue.open(alias='parsel-requests')

    # Use a shared session pool between both crawlers
    async with SessionPool() as session_pool:
        playwright_crawler = PlaywrightCrawler(
            # Set the request queue for the Playwright crawler
            request_manager=playwright_rq,
            session_pool=session_pool,
            # Configure concurrency settings for the Playwright crawler
            concurrency_settings=ConcurrencySettings(
                max_concurrency=5, desired_concurrency=5
            ),
            # Set `keep_alive` so that the crawler does not stop working when there are
            # no requests in the queue.
            keep_alive=True,
        )

        parsel_crawler = ParselCrawler(
            # Set the request queue for the Parsel crawler
            request_manager=parsel_rq,
            session_pool=session_pool,
            # Configure concurrency settings for the Parsel crawler
            concurrency_settings=ConcurrencySettings(
                max_concurrency=10, desired_concurrency=10
            ),
            # Set the maximum number of requests per crawl for the Parsel crawler
            max_requests_per_crawl=50,
        )

        @playwright_crawler.router.default_handler
        async def handle_playwright(context: PlaywrightCrawlingContext) -> None:
            context.log.info(f'Playwright Processing {context.request.url}...')

            title = await context.page.title()
            # Push the extracted data to the dataset for the Playwright crawler
            await context.push_data(
                {'title': title, 'url': context.request.url, 'source': 'playwright'},
                dataset_name='playwright-data',
            )

        @parsel_crawler.router.default_handler
        async def handle_parsel(context: ParselCrawlingContext) -> None:
            context.log.info(f'Parsel Processing {context.request.url}...')

            title = context.parsed_content.css('title::text').get()
            # Push the extracted data to the dataset for the Parsel crawler
            await context.push_data(
                {'title': title, 'url': context.request.url, 'source': 'parsel'},
                dataset_name='parsel-data',
            )

            # Enqueue blog links to the Playwright request queue
            await context.enqueue_links(
                selector='a[href*="/blog/"]', rq_alias='playwright-requests'
            )
            # Enqueue other links to the Parsel request queue
            await context.enqueue_links(selector='a:not([href*="/blog/"])')

        # Start the Playwright crawler in the background
        background_crawler_task = asyncio.create_task(playwright_crawler.run([]))

        # Run the Parsel crawler with the initial URL and wait for it to finish
        await parsel_crawler.run(['https://crawlee.dev/blog'])

        # Wait for the Playwright crawler to finish processing all requests
        while not await playwright_rq.is_empty():
            playwright_crawler.log.info('Waiting for Playwright crawler to finish...')
            await asyncio.sleep(5)

        # Stop the Playwright crawler after all requests are processed
        playwright_crawler.stop()

        # Wait for the background Playwright crawler task to complete
        await background_crawler_task


if __name__ == '__main__':
    asyncio.run(main())
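
Each crawler writes its results to its own named dataset ('playwright-data' and 'parsel-data'). As a quick sanity check after the run, you can open those datasets and print the collected items. The snippet below is a minimal sketch using Dataset.open() and get_data(); the dataset names match those used in the example above.

import asyncio

from crawlee.storages import Dataset


async def print_results() -> None:
    # Open the named datasets that the two crawlers pushed their data into.
    for name in ('playwright-data', 'parsel-data'):
        dataset = await Dataset.open(name=name)
        data = await dataset.get_data()
        print(f'{name}: {len(data.items)} items')
        for item in data.items:
            print(item)


if __name__ == '__main__':
    asyncio.run(print_results())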