Crawl all links on website
This example uses the enqueue_links
helper to add new links to the RequestQueue
as the crawler navigates from page to page. By automatically discovering and enqueuing all links on a given page, the crawler can systematically scrape an entire website. This approach is ideal for web scraping tasks where you need to collect data from multiple interconnected pages.
tip
If no options are given, by default the method will only add links that are under the same subdomain. This behavior can be controlled with the strategy
option, which is an instance of the EnqueueStrategy
enum. You can find more info about this option in the Crawl website with relative links example.
- BeautifulSoupCrawler
- PlaywrightCrawler
import asyncio
from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
async def main() -> None:
crawler = BeautifulSoupCrawler(
# Limit the crawl to max requests. Remove or increase it for crawling all links.
max_requests_per_crawl=10,
)
# Define the default request handler, which will be called for every request.
@crawler.router.default_handler
async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
context.log.info(f'Processing {context.request.url} ...')
# Enqueue all links found on the page.
await context.enqueue_links()
# Run the crawler with the initial list of requests.
await crawler.run(['https://crawlee.dev'])
if __name__ == '__main__':
asyncio.run(main())
import asyncio
from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext
async def main() -> None:
crawler = PlaywrightCrawler(
# Limit the crawl to max requests. Remove or increase it for crawling all links.
max_requests_per_crawl=10,
)
# Define the default request handler, which will be called for every request.
@crawler.router.default_handler
async def request_handler(context: PlaywrightCrawlingContext) -> None:
context.log.info(f'Processing {context.request.url} ...')
# Enqueue all links found on the page.
await context.enqueue_links()
# Run the crawler with the initial list of requests.
await crawler.run(['https://crawlee.dev'])
if __name__ == '__main__':
asyncio.run(main())