AdaptivePlaywrightCrawler
This example demonstrates how to use AdaptivePlaywrightCrawler
. An AdaptivePlaywrightCrawler
is a combination of PlaywrightCrawler
and some implementation of HTTP-based crawler such as ParselCrawler
or BeautifulSoupCrawler
.
It uses a more limited crawling context interface so that it is able to switch to HTTP-only crawling when it detects that it may bring a performance benefit.
A pre-navigation hook can be used to perform actions before navigating to the URL. This hook provides further flexibility in controlling environment and preparing for navigation. Hooks will be executed both for the pages crawled by HTTP-bases sub crawler and playwright based sub crawler. Use playwright_only=True
to mark hooks that should be executed only for playwright sub crawler.
For more detailed description please see AdaptivePlaywrightCrawler guide
import asyncio
from datetime import timedelta
from playwright.async_api import Route
from crawlee.crawlers import (
AdaptivePlaywrightCrawler,
AdaptivePlaywrightCrawlingContext,
AdaptivePlaywrightPreNavCrawlingContext,
)
async def main() -> None:
# Crawler created by following factory method will use `beautifulsoup`
# for parsing static content.
crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
max_requests_per_crawl=5, playwright_crawler_specific_kwargs={'headless': False}
)
@crawler.router.default_handler
async def request_handler_for_label(
context: AdaptivePlaywrightCrawlingContext,
) -> None:
# Do some processing using `parsed_content`
context.log.info(context.parsed_content.title)
# Locate element h2 within 5 seconds
h2 = await context.query_selector_one('h2', timedelta(milliseconds=5000))
# Do stuff with element found by the selector
context.log.info(h2)
# Find more links and enqueue them.
await context.enqueue_links()
# Save some data.
await context.push_data({'Visited url': context.request.url})
@crawler.pre_navigation_hook
async def hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:
"""Hook executed both in static sub crawler and playwright sub crawler.
Trying to access `context.page` in this hook would raise `AdaptiveContextError`
for pages crawled without playwright."""
context.log.info(f'pre navigation hook for: {context.request.url} ...')
@crawler.pre_navigation_hook(playwright_only=True)
async def hook_playwright(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:
"""Hook executed only in playwright sub crawler.
It is safe to access `page` object.
"""
async def some_routing_function(route: Route) -> None:
await route.continue_()
await context.page.route('*/**', some_routing_function)
context.log.info(
f'Playwright only pre navigation hook for: {context.request.url} ...'
)
# Run the crawler with the initial list of URLs.
await crawler.run(['https://warehouse-theme-metal.myshopify.com/'])
if __name__ == '__main__':
asyncio.run(main())