Add data to dataset
This example demonstrates how to store extracted data in datasets using the `context.push_data` helper function. If the specified dataset does not already exist, it will be created automatically. Additionally, you can save data to a custom dataset by passing the `dataset_id` or `dataset_name` parameter to the `push_data` function.
- BeautifulSoupCrawler
- PlaywrightCrawler
import asyncio
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
async def main() -> None:
    """Crawl a fixed list of URLs and push one record per page to the dataset."""
    crawler = BeautifulSoupCrawler()

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        """Default handler: build a record for the current page and store it."""
        context.log.info(f'Processing {context.request.url} ...')

        # The page may have no <title> element; the title is then None.
        title_tag = context.soup.title
        page_title = title_tag.string if title_tag else None

        # Keep only the first 1000 characters of the stringified document.
        html_preview = str(context.soup)[:1000]

        record = {
            'url': context.request.url,
            'title': page_title,
            'html': html_preview,
        }

        # No dataset is named here, so the record goes to the default dataset.
        await context.push_data(record)

    # Initial requests the crawler starts from.
    start_urls = [
        'https://crawlee.dev',
        'https://apify.com',
        'https://example.com',
    ]
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())
import asyncio
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
async def main() -> None:
    """Crawl a fixed list of URLs in a browser and push one record per page."""
    crawler = PlaywrightCrawler()

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        """Default handler: build a record for the current page and store it."""
        context.log.info(f'Processing {context.request.url} ...')

        # Query the page for its title first, then for its content.
        page_title = await context.page.title()
        page_html = await context.page.content()

        record = {
            'url': context.request.url,
            'title': page_title,
            # Keep only the first 1000 characters of the page content.
            'html': str(page_html)[:1000],
        }

        # No dataset is named here, so the record goes to the default dataset.
        await context.push_data(record)

    # Initial requests the crawler starts from.
    start_urls = [
        'https://crawlee.dev',
        'https://apify.com',
        'https://example.com',
    ]
    await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())
Each item in the dataset will be stored in its own file within the following directory:
{PROJECT_FOLDER}/storage/datasets/default/
For more control, you can also open a dataset manually using the asynchronous constructor `Dataset.open`:
from crawlee.storages import Dataset
async def main() -> None:
    """Open a dataset by hand and push a single item into it."""
    # Dataset.open() is awaited: it is an asynchronous constructor.
    # Called without arguments, as here, no specific dataset is selected.
    opened_dataset = await Dataset.open()

    # Work with the dataset object directly instead of going through a crawler context.
    item = {'key': 'value'}
    await opened_dataset.push_data(item)