BeautifulSoup crawler
This example demonstrates how to use BeautifulSoupCrawler to crawl a list of URLs, load each URL using a plain HTTP request, parse the HTML using the BeautifulSoup library, and extract some data from it: the page title and all <h1>, <h2> and <h3> tags. This setup is well suited for scraping specific elements from web pages. Thanks to the well-known BeautifulSoup library, you can easily navigate the HTML structure and retrieve the data you need with minimal code.
import asyncio
from datetime import timedelta

from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    # Create an instance of the BeautifulSoupCrawler class, a crawler that automatically
    # loads the URLs and parses their HTML using the BeautifulSoup library.
    crawler = BeautifulSoupCrawler(
        # On error, retry each page at most once.
        max_request_retries=1,
        # Increase the timeout for processing each page to 30 seconds.
        request_handler_timeout=timedelta(seconds=30),
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
    )

    # Define the default request handler, which will be called for every request.
    # The handler receives a context parameter, providing various properties and
    # helper methods. Here are a few key ones we use for demonstration:
    # - request: an instance of the Request class containing details such as the URL
    #   being crawled and the HTTP method used.
    # - soup: the BeautifulSoup object containing the parsed HTML of the response.
    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Extract data from the page.
        data = {
            'url': context.request.url,
            'title': context.soup.title.string if context.soup.title else None,
            'h1s': [h1.text for h1 in context.soup.find_all('h1')],
            'h2s': [h2.text for h2 in context.soup.find_all('h2')],
            'h3s': [h3.text for h3 in context.soup.find_all('h3')],
        }

        # Push the extracted data to the default dataset. In local configuration,
        # the data will be stored as JSON files in ./storage/datasets/default.
        await context.push_data(data)

    # Run the crawler with the initial list of URLs.
    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
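
If you want the crawler to follow the links it discovers instead of visiting only the start URLs, and to export the collected results when the run finishes, you can extend the handler with the enqueue_links context helper and call export_data on the crawler. The sketch below is a minimal variation of the example above; the results.json output path is just an illustration, and the helper names assume a recent Crawlee for Python release.

import asyncio

from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    # Keep a hard limit so the crawl stays small for this demonstration.
    crawler = BeautifulSoupCrawler(max_requests_per_crawl=50)

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        # Store the page title for every visited URL.
        await context.push_data({
            'url': context.request.url,
            'title': context.soup.title.string if context.soup.title else None,
        })
        # Enqueue the links found on the page so the crawler keeps following them,
        # up to the max_requests_per_crawl limit set above.
        await context.enqueue_links()

    await crawler.run(['https://crawlee.dev'])

    # Export everything pushed to the default dataset into a single file
    # ('results.json' is an arbitrary path chosen for this sketch).
    await crawler.export_data('results.json')


if __name__ == '__main__':
    asyncio.run(main())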