Crawl specific links on a website
This example demonstrates how to crawl a website while targeting specific patterns of links. With the `enqueue_links` helper, you can pass `include` or `exclude` parameters to refine your crawling strategy. This ensures that only links matching the specified patterns are added to the `RequestQueue`. Both `include` and `exclude` accept lists of globs or regular expressions (a regex-based variant is sketched after the examples below). This functionality is useful for focusing on the relevant sections of a website and avoiding scraping unnecessary or irrelevant content.
- BeautifulSoupCrawler
- PlaywrightCrawler
```python
import asyncio

from crawlee import Glob
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
    )

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Enqueue all the documentation links found on the page, except for the examples.
        await context.enqueue_links(
            include=[Glob('https://crawlee.dev/docs/**')],
            exclude=[Glob('https://crawlee.dev/docs/examples')],
        )

    # Run the crawler with the initial list of requests.
    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```
```python
import asyncio

from crawlee import Glob
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    crawler = PlaywrightCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
    )

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Enqueue all the documentation links found on the page, except for the examples.
        await context.enqueue_links(
            include=[Glob('https://crawlee.dev/docs/**')],
            exclude=[Glob('https://crawlee.dev/docs/examples')],
        )

    # Run the crawler with the initial list of requests.
    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```
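Since `include` and `exclude` also accept compiled regular expressions, the same filter can be written with `re.compile` instead of `Glob`. The following is a minimal sketch of the `BeautifulSoupCrawler` variant rewritten that way; the patterns are illustrative equivalents of the globs above.

```python
import asyncio
import re

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler(max_requests_per_crawl=10)

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Same filter as above, expressed with regular expressions instead of globs.
        # The patterns are illustrative; adjust them to the site you are crawling.
        await context.enqueue_links(
            include=[re.compile(r'https://crawlee\.dev/docs/.*')],
            exclude=[re.compile(r'https://crawlee\.dev/docs/examples')],
        )

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```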
Even more control over the enqueued links
`enqueue_links` is a convenience helper: internally it calls `extract_links` to find the links and `add_requests` to add them to the queue. If you need additional custom filtering of the extracted links before enqueuing them, consider using `extract_links` and `add_requests` instead of the `enqueue_links` helper.
- BeautifulSoupCrawler
- PlaywrightCrawler
```python
import asyncio

from crawlee import Glob
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
    )

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Extract all the documentation links found on the page, except for the examples.
        extracted_links = await context.extract_links(
            include=[Glob('https://crawlee.dev/docs/**')],
            exclude=[Glob('https://crawlee.dev/docs/examples')],
        )

        # Some very custom filtering which can't be achieved by `extract_links` arguments.
        max_link_length = 30
        filtered_links = [
            link for link in extracted_links if len(link.url) < max_link_length
        ]

        # Add filtered links to the request queue.
        await context.add_requests(filtered_links)

    # Run the crawler with the initial list of requests.
    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```
```python
import asyncio

from crawlee import Glob
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    crawler = PlaywrightCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
    )

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Extract all the documentation links found on the page, except for the examples.
        extracted_links = await context.extract_links(
            include=[Glob('https://crawlee.dev/docs/**')],
            exclude=[Glob('https://crawlee.dev/docs/examples')],
        )

        # Some very custom filtering which can't be achieved by `extract_links` arguments.
        max_link_length = 30
        filtered_links = [
            link for link in extracted_links if len(link.url) < max_link_length
        ]

        # Add filtered links to the request queue.
        await context.add_requests(filtered_links)

    # Run the crawler with the initial list of requests.
    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```
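The URL-length check above is only a placeholder for whatever logic you actually need. As one further illustration (a sketch, not part of the original example), the helper below drops links whose URL carries a query string or a fragment; the helper name is hypothetical, and it relies only on the fact, used above, that the objects returned by `extract_links` expose a `url` attribute.

```python
from urllib.parse import urlparse

from crawlee import Request


def drop_links_with_query_or_fragment(links: list[Request]) -> list[Request]:
    """Keep only links whose URL has no query string and no fragment.

    Hypothetical helper: call it on the result of `extract_links` before
    passing the remaining links to `add_requests`.
    """
    return [
        link
        for link in links
        if not urlparse(link.url).query and not urlparse(link.url).fragment
    ]
```

Inside the request handler you would then enqueue the filtered links with `await context.add_requests(drop_links_with_query_or_fragment(extracted_links))`.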