Keep a Crawler alive waiting for more requests
This example demonstrates how to keep the crawler alive even when there are no requests at the moment, by using the keep_alive=True argument of BasicCrawler.__init__. The option is available to all crawlers that inherit from BasicCrawler; in the example below it is shown on BeautifulSoupCrawler. To stop a crawler that was started with keep_alive=True, call crawler.stop().
import asyncio

from crawlee._types import BasicCrawlingContext
from crawlee.crawlers import BeautifulSoupCrawler


async def main() -> None:
    crawler = BeautifulSoupCrawler(
        # Keep the crawler alive even when there are no requests to be processed now.
        keep_alive=True,
    )

    def stop_crawler_if_url_visited(context: BasicCrawlingContext) -> None:
        """Stop the crawler once a specific URL is visited.

        Example of a guard condition used to stop the crawler."""
        if context.request.url == 'https://crawlee.dev/docs/examples':
            crawler.stop(
                'Stop crawler that was in keep_alive state after specific url was visited'
            )
        else:
            context.log.info('keep_alive=True, waiting for more requests to come.')

    async def add_request_later(url: str, after_s: int) -> None:
        """Add a request to the queue after some time. Can be done by external code."""
        # Just an example of a request being added to the crawler later,
        # when it is waiting due to `keep_alive=True`.
        await asyncio.sleep(after_s)
        await crawler.add_requests([url])

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: BasicCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')
        # Stop the crawler if the guard condition has been met.
        stop_crawler_if_url_visited(context)

    # Start some tasks that will add requests later, to simulate a real situation
    # where requests are added by external code.
    add_request_later_task1 = asyncio.create_task(
        add_request_later(url='https://crawlee.dev', after_s=1)
    )
    add_request_later_task2 = asyncio.create_task(
        add_request_later(url='https://crawlee.dev/docs/examples', after_s=5)
    )

    # Run the crawler without an initial list of requests.
    # It will wait for more requests to be added to the queue later due to `keep_alive=True`.
    await crawler.run()

    await asyncio.gather(add_request_later_task1, add_request_later_task2)


if __name__ == '__main__':
    asyncio.run(main())
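The stop condition does not have to live inside a request handler. Since crawler.stop() is a plain method call, it can also be invoked from any other coroutine in the application, for example when a shutdown signal arrives or a timeout expires. The sketch below illustrates that idea; the stop_after_timeout helper and the 10-second delay are illustrative assumptions, not part of the example above.

import asyncio

from crawlee._types import BasicCrawlingContext
from crawlee.crawlers import BeautifulSoupCrawler


async def main() -> None:
    crawler = BeautifulSoupCrawler(keep_alive=True)

    @crawler.router.default_handler
    async def request_handler(context: BasicCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

    async def stop_after_timeout(timeout_s: int) -> None:
        # Hypothetical external stop condition: give up waiting after a fixed delay.
        await asyncio.sleep(timeout_s)
        crawler.stop('Stopping the keep_alive crawler from outside the request handler.')

    # The 10-second delay is an arbitrary value chosen for this sketch.
    stop_task = asyncio.create_task(stop_after_timeout(timeout_s=10))

    # With keep_alive=True the crawler keeps waiting even though no requests were queued,
    # until stop() is called from the external task.
    await crawler.run()
    await stop_task


if __name__ == '__main__':
    asyncio.run(main())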