
Crawl specific links on a website

This example demonstrates how to crawl a website while targeting specific patterns of links. Using the enqueue_links helper, you can pass include or exclude parameters so that only links matching the specified patterns are added to the RequestQueue. Both include and exclude accept lists of globs or regular expressions. This is useful for focusing on the relevant sections of a website and avoiding unnecessary or irrelevant content.

import asyncio

from crawlee import Glob
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
    )

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Enqueue all the documentation links found on the page, except for the examples.
        await context.enqueue_links(
            include=[Glob('https://crawlee.dev/docs/**')],
            exclude=[Glob('https://crawlee.dev/docs/examples')],
        )

    # Run the crawler with the initial list of requests.
    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
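
Because include and exclude also accept regular expressions, the same filtering can be written without Glob. The following is a minimal sketch under the assumption that your installed Crawlee version accepts compiled re.Pattern objects in these parameters; the patterns mirror the globs above and are illustrative only.

import asyncio
import re

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler(max_requests_per_crawl=10)

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Regular expressions used in place of globs for include/exclude.
        # These patterns mirror the glob example above and are illustrative only.
        await context.enqueue_links(
            include=[re.compile(r'https://crawlee\.dev/docs/.*')],
            exclude=[re.compile(r'https://crawlee\.dev/docs/examples')],
        )

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())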

enqueue_links is a convenience helper: internally it calls extract_links to find the links on the page and add_requests to add them to the queue. If you need additional custom filtering of the extracted links before enqueuing them, consider using extract_links and add_requests instead of enqueue_links.

import asyncio

from crawlee import Glob
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
crawler = BeautifulSoupCrawler(
# Limit the crawl to max requests. Remove or increase it for crawling all links.
max_requests_per_crawl=10,
)

# Define the default request handler, which will be called for every request.
@crawler.router.default_handler
async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
context.log.info(f'Processing {context.request.url} ...')

# Extract all the documentation links found on the page, except for the examples.
extracted_links = await context.extract_links(
include=[Glob('https://crawlee.dev/docs/**')],
exclude=[Glob('https://crawlee.dev/docs/examples')],
)
# Some very custom filtering which can't be achieved by `extract_links` arguments.
max_link_length = 30
filtered_links = [
link for link in extracted_links if len(link.url) < max_link_length
]
# Add filtered links to the request queue.
await context.add_requests(filtered_links)

# Run the crawler with the initial list of requests.
await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
asyncio.run(main())
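
The filtering step between extract_links and add_requests can be any Python logic. As another sketch, the rule below drops links that carry query parameters before enqueuing; this rule is illustrative and not part of the original example.

import asyncio
from urllib.parse import urlsplit

from crawlee import Glob
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler(max_requests_per_crawl=10)

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Extract the documentation links first, then filter them manually.
        extracted_links = await context.extract_links(
            include=[Glob('https://crawlee.dev/docs/**')],
        )
        # Illustrative custom rule: skip links that carry query parameters.
        filtered_links = [
            link for link in extracted_links if not urlsplit(link.url).query
        ]
        await context.add_requests(filtered_links)

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())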