Export entire dataset to file
This example demonstrates how to use the export_data method of BasicCrawler to export the entire default dataset to a single file. The method supports exporting data in either CSV or JSON format.
note
For these examples, we are using the BeautifulSoupCrawler. However, the same method is available for other crawlers as well. You can use it in exactly the same way.
- JSON
- CSV
import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
    )

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Extract data from the page.
        data = {
            'url': context.request.url,
            'title': context.soup.title.string if context.soup.title else None,
        }

        # Enqueue all links found on the page.
        await context.enqueue_links()

        # Push the extracted data to the default dataset.
        await context.push_data(data)

    # Run the crawler with the initial list of URLs.
    await crawler.run(['https://crawlee.dev'])

    # Export the entire dataset to a JSON file.
    await crawler.export_data(path='results.json')


if __name__ == '__main__':
    asyncio.run(main())
import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
    )

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Extract data from the page.
        data = {
            'url': context.request.url,
            'title': context.soup.title.string if context.soup.title else None,
        }

        # Enqueue all links found on the page.
        await context.enqueue_links()

        # Push the extracted data to the default dataset.
        await context.push_data(data)

    # Run the crawler with the initial list of URLs.
    await crawler.run(['https://crawlee.dev'])

    # Export the entire dataset to a CSV file.
    await crawler.export_data(path='results.csv')


if __name__ == '__main__':
    asyncio.run(main())
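
If you want to work with the scraped items in memory rather than writing them to a file, the crawler also exposes a get_data method that returns the contents of the default dataset. The following is a minimal sketch of that approach; it assumes get_data returns a page object whose items attribute holds the dataset records, and it keeps the handler intentionally shorter than the examples above.

import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
    )

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Push the extracted data to the default dataset.
        await context.push_data({
            'url': context.request.url,
            'title': context.soup.title.string if context.soup.title else None,
        })

        # Enqueue all links found on the page.
        await context.enqueue_links()

    # Run the crawler with the initial list of URLs.
    await crawler.run(['https://crawlee.dev'])

    # Assumption: get_data returns the default dataset's items, which you can
    # inspect or post-process before (or instead of) exporting them to a file.
    dataset_contents = await crawler.get_data()
    for item in dataset_contents.items:
        print(item['url'], item['title'])


if __name__ == '__main__':
    asyncio.run(main())

This can be handy for quick checks or custom post-processing, while export_data remains the simpler choice when all you need is a JSON or CSV file.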