Crawl website with relative links

When crawling a website, you may encounter various types of links that you wish to include in your crawl. To facilitate this, we provide the enqueue_links method on the crawler context, which will automatically find and add these links to the crawler's RequestQueue. This method simplifies the process of handling different types of links, including relative links, by automatically resolving them based on the page's context.

note

For these examples, we are using the BeautifulSoupCrawler. However, the same method is available for other crawlers as well. You can use it in exactly the same way.

EnqueueStrategy type alias provides four distinct strategies for crawling relative links:

all - Enqueues all links found, regardless of the domain they point to. This strategy is useful when you want to follow every link, including those that navigate to external websites.
same-domain - Enqueues all links found that share the same domain name, including any possible subdomains. This strategy ensures that all links within the same top-level and base domain are included.
same-hostname - Enqueues all links found for the exact same hostname. This is the default strategy, and it restricts the crawl to links that have the same hostname as the current page, excluding subdomains.
same-origin - Enqueues all links found that share the same origin. The same origin refers to URLs that share the same protocol, domain, and port, ensuring a strict scope for the crawl.

All links
Same domain
Same hostname
Same origin

Run on

import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
    )

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Enqueue all links found on the page. Any URLs found will be matched by
        # this strategy, even if they go off the site you are currently crawling.
        await context.enqueue_links(strategy='all')

    # Run the crawler with the initial list of requests.
    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())

Run on

import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
    )

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Setting the strategy to same domain will enqueue all links found that
        # are on the same hostname as request.loaded_url or request.url.
        await context.enqueue_links(strategy='same-domain')

    # Run the crawler with the initial list of requests.
    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())

Run on

import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
    )

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Setting the strategy to same hostname will enqueue all links found that are on
        # the same hostname (including subdomains) as request.loaded_url or request.url.
        await context.enqueue_links(strategy='same-hostname')

    # Run the crawler with the initial list of requests.
    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())

Run on

import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
    )

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Setting the strategy to same origin will enqueue all links found that are on
        # the same origin as request.loaded_url or request.url.
        await context.enqueue_links(strategy='same-origin')

    # Run the crawler with the initial list of requests.
    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())