Capturing page snapshots with ErrorSnapshotter
This example demonstrates how to capture page snapshots on the first occurrence of each unique error. The capturing happens automatically if you set `save_error_snapshots=True` in the crawler's `Statistics`. The error snapshot can contain an `html` file and a `jpeg` file that are created from the page where the unhandled exception was raised. Captured error snapshot files are saved to the default key-value store. Both `PlaywrightCrawler` and HTTP crawlers are capable of capturing the `html` file, but only `PlaywrightCrawler` is able to capture a page screenshot as well.
- ParselCrawler
- PlaywrightCrawler
Run on
import asyncio
from random import choice
from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from crawlee.statistics import Statistics
async def main() -> None:
    """Crawl https://crawlee.dev with a `ParselCrawler` that has error snapshots enabled.

    The request handler raises one of several exceptions at random so that
    different unique errors occur during the crawl.
    """
    # Error snapshots are switched on through the crawler's statistics.
    snapshot_statistics = Statistics.with_default_state(save_error_snapshots=True)
    crawler = ParselCrawler(statistics=snapshot_statistics)

    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        """Log the processed URL, enqueue links from the page, then maybe fail."""
        context.log.info(f'Processing {context.request.url} ...')

        # Enqueue links before any simulated failure so the crawl keeps going.
        await context.enqueue_links()

        # Fail on 3 of the 10 possible draws, each with a different exception
        # type, to show that `ErrorSnapshotter` saves a snapshot only for the
        # first occurrence of each unique error.
        roll = choice(range(10))
        if roll == 1:
            raise KeyError('Some KeyError')
        if roll == 2:
            raise ValueError('Some ValueError')
        if roll == 3:
            raise RuntimeError('Some RuntimeError')

    await crawler.run(['https://crawlee.dev'])
if __name__ == '__main__':
    # Start the crawl only when this file is executed as a script.
    asyncio.run(main())
Run on
import asyncio
from random import choice
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
from crawlee.statistics import Statistics
async def main() -> None:
    """Crawl https://crawlee.dev with a `PlaywrightCrawler` that has error snapshots enabled.

    The request handler raises one of several exceptions at random so that
    different unique errors occur during the crawl.
    """
    # Error snapshots are switched on through the crawler's statistics.
    snapshot_statistics = Statistics.with_default_state(save_error_snapshots=True)
    crawler = PlaywrightCrawler(statistics=snapshot_statistics)

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        """Log the processed URL, enqueue links from the page, then maybe fail."""
        context.log.info(f'Processing {context.request.url} ...')

        # Enqueue links before any simulated failure so the crawl keeps going.
        await context.enqueue_links()

        # Fail on 3 of the 10 possible draws, each with a different exception
        # type, to show that `ErrorSnapshotter` saves a snapshot only for the
        # first occurrence of each unique error.
        roll = choice(range(10))
        if roll == 1:
            raise KeyError('Some KeyError')
        if roll == 2:
            raise ValueError('Some ValueError')
        if roll == 3:
            raise RuntimeError('Some RuntimeError')

    await crawler.run(['https://crawlee.dev'])
if __name__ == '__main__':
    # Start the crawl only when this file is executed as a script.
    asyncio.run(main())