Export entire dataset to file
This example demonstrates how to use the BasicCrawler.export_data method to export the entire default dataset to a single file. The method supports the CSV and JSON formats and accepts additional keyword arguments that are passed on to the underlying json.dump or csv.writer call, so you can fine-tune the output.
note
For these examples, we are using the BeautifulSoupCrawler. However, the same method is available for other crawlers as well and can be used in exactly the same way; a short sketch with a different crawler class follows the examples below.
JSON
import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
    )

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Extract data from the page.
        data = {
            'url': context.request.url,
            'title': context.soup.title.string if context.soup.title else None,
        }

        # Enqueue all links found on the page.
        await context.enqueue_links()

        # Push the extracted data to the default dataset.
        await context.push_data(data)

    # Run the crawler with the initial list of URLs.
    await crawler.run(['https://crawlee.dev'])

    # Export the entire dataset to a JSON file.
    # Set ensure_ascii=False to allow Unicode characters in the output.
    await crawler.export_data(path='results.json', ensure_ascii=False)


if __name__ == '__main__':
    asyncio.run(main())
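If you want to sanity-check the export, the file can be read back with the standard json module. This read-back step is not part of the Crawlee API, just a quick verification sketch, and it assumes the export produces a JSON array with one object per push_data call:

import json

# Load the exported dataset and print a quick summary.
# Assumes 'results.json' was produced by the example above.
with open('results.json', encoding='utf-8') as f:
    records = json.load(f)

print(f'Exported {len(records)} records')
if records:
    print(f'First record: {records[0]}')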
CSV
import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
    )

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Extract data from the page.
        data = {
            'url': context.request.url,
            'title': context.soup.title.string if context.soup.title else None,
        }

        # Enqueue all links found on the page.
        await context.enqueue_links()

        # Push the extracted data to the default dataset.
        await context.push_data(data)

    # Run the crawler with the initial list of URLs.
    await crawler.run(['https://crawlee.dev'])

    # Export the entire dataset to a CSV file.
    # Use semicolon as delimiter and always quote strings.
    await crawler.export_data(path='results.csv', delimiter=';', quoting='all')


if __name__ == '__main__':
    asyncio.run(main())
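As the note above mentions, export_data is available on other crawler classes too, because it is defined on the crawler base class. Below is a minimal sketch, assuming ParselCrawler from the same crawlee.crawlers package; the page title is extracted with a CSS selector instead of BeautifulSoup, while the export call stays the same:

import asyncio

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext


async def main() -> None:
    crawler = ParselCrawler(
        # Keep the crawl small for this sketch.
        max_requests_per_crawl=10,
    )

    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Extract the title with a CSS selector instead of BeautifulSoup.
        await context.push_data({
            'url': context.request.url,
            'title': context.selector.css('title::text').get(),
        })

        await context.enqueue_links()

    await crawler.run(['https://crawlee.dev'])

    # The export call is identical to the BeautifulSoupCrawler examples above.
    await crawler.export_data(path='results.json', ensure_ascii=False)


if __name__ == '__main__':
    asyncio.run(main())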