Playwright crawler with block requests
This example demonstrates how to optimize your PlaywrightCrawler
performance by blocking unnecessary network requests.
The primary use case is when you need to scrape or interact with web pages without loading non-essential resources like images, styles, or analytics scripts. This can significantly reduce bandwidth usage and improve crawling speed.
The block_requests
helper provides the most efficient way to block requests as it operates directly in the browser.
By default, block_requests
blocks all URLs that contain any of the following patterns:
['.css', '.webp', '.jpg', '.jpeg', '.png', '.svg', '.gif', '.woff', '.pdf', '.zip']
You can also replace the default patterns list with your own by providing url_patterns
, or extend it by passing additional patterns in extra_url_patterns
.
import asyncio
from crawlee.crawlers import (
PlaywrightCrawler,
PlaywrightCrawlingContext,
PlaywrightPreNavCrawlingContext,
)
async def main() -> None:
    """Crawl crawlee.dev with Playwright while blocking non-essential requests.

    Registers a default request handler that enqueues discovered links, and a
    pre-navigation hook that blocks the default resource patterns plus
    `adsbygoogle.js` to save bandwidth and speed up crawling.
    """
    crawler = PlaywrightCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
    )

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')
        # Enqueue all links found on the page for further crawling.
        await context.enqueue_links()

    # Define the hook, which will be called before every request.
    @crawler.pre_navigation_hook
    async def navigation_hook(context: PlaywrightPreNavCrawlingContext) -> None:
        context.log.info(f'Navigating to {context.request.url} ...')
        # Block all requests to URLs that include `adsbygoogle.js` and also all defaults.
        await context.block_requests(extra_url_patterns=['adsbygoogle.js'])

    # Run the crawler with the initial list of URLs.
    await crawler.run(['https://crawlee.dev/'])
# Standard script entry point: run the async main() only when executed directly,
# not when this module is imported.
if __name__ == '__main__':
    asyncio.run(main())