Crawl specific links on a website
This example demonstrates how to crawl a website while targeting specific patterns of links. With the `enqueue_links` helper, you can pass `include` or `exclude` parameters to refine your crawling strategy. This ensures that only links matching the specified patterns are added to the `RequestQueue`. Both `include` and `exclude` accept lists of globs or regular expressions (a regex-based variant is sketched after the examples below). This functionality is useful for focusing on the relevant sections of a website and avoiding scraping unnecessary or irrelevant content.
- BeautifulSoupCrawler
- PlaywrightCrawler
```python
import asyncio

from crawlee import Glob
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
    )

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Enqueue all the documentation links found on the page, except for the examples.
        await context.enqueue_links(
            include=[Glob('https://crawlee.dev/docs/**')],
            exclude=[Glob('https://crawlee.dev/docs/examples')],
        )

    # Run the crawler with the initial list of requests.
    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```
```python
import asyncio

from crawlee import Glob
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    crawler = PlaywrightCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
    )

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Enqueue all the documentation links found on the page, except for the examples.
        await context.enqueue_links(
            include=[Glob('https://crawlee.dev/docs/**')],
            exclude=[Glob('https://crawlee.dev/docs/examples')],
        )

    # Run the crawler with the initial list of requests.
    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```
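Since `include` and `exclude` also accept compiled regular expressions, the same filter can be written with `re.compile` instead of `Glob`. The following is a minimal sketch of the `BeautifulSoupCrawler` variant rewritten that way; the patterns are illustrative equivalents of the globs above.

```python
import asyncio
import re

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler(max_requests_per_crawl=10)

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Same filter as above, expressed with regular expressions instead of globs.
        # The patterns are illustrative; adjust them to the site you are crawling.
        await context.enqueue_links(
            include=[re.compile(r'https://crawlee\.dev/docs/.*')],
            exclude=[re.compile(r'https://crawlee\.dev/docs/examples')],
        )

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```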
Even more control over the enqueued links
`enqueue_links` is a convenience helper: internally it calls `extract_links` to find the links and `add_requests` to add them to the queue. If you need additional custom filtering of the extracted links before enqueuing them, consider using `extract_links` and `add_requests` instead of the `enqueue_links` helper.
- BeautifulSoupCrawler
- PlaywrightCrawler
```python
import asyncio

from crawlee import Glob
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
    )

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Extract all the documentation links found on the page, except for the examples.
        extracted_links = await context.extract_links(
            include=[Glob('https://crawlee.dev/docs/**')],
            exclude=[Glob('https://crawlee.dev/docs/examples')],
        )

        # Some very custom filtering which can't be achieved by `extract_links` arguments.
        max_link_length = 30
        filtered_links = [
            link for link in extracted_links if len(link.url) < max_link_length
        ]

        # Add filtered links to the request queue.
        await context.add_requests(filtered_links)

    # Run the crawler with the initial list of requests.
    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```
```python
import asyncio

from crawlee import Glob
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    crawler = PlaywrightCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
    )

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Extract all the documentation links found on the page, except for the examples.
        extracted_links = await context.extract_links(
            include=[Glob('https://crawlee.dev/docs/**')],
            exclude=[Glob('https://crawlee.dev/docs/examples')],
        )

        # Some very custom filtering which can't be achieved by `extract_links` arguments.
        max_link_length = 30
        filtered_links = [
            link for link in extracted_links if len(link.url) < max_link_length
        ]

        # Add filtered links to the request queue.
        await context.add_requests(filtered_links)

    # Run the crawler with the initial list of requests.
    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```
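The URL-length check above is only a placeholder for whatever logic you actually need. As one further illustration (a sketch, not part of the original example), the helper below drops links whose URL carries a query string or a fragment; the helper name is hypothetical, and it relies only on the fact, used above, that the objects returned by `extract_links` expose a `url` attribute.

```python
from urllib.parse import urlparse

from crawlee import Request


def drop_links_with_query_or_fragment(links: list[Request]) -> list[Request]:
    """Keep only links whose URL has no query string and no fragment.

    Hypothetical helper: call it on the result of `extract_links` before
    passing the remaining links to `add_requests`.
    """
    return [
        link
        for link in links
        if not urlparse(link.url).query and not urlparse(link.url).fragment
    ]
```

Inside the request handler you would then enqueue the filtered links with `await context.add_requests(drop_links_with_query_or_fragment(extracted_links))`.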