Playwright crawler with fingerprint generator
This example demonstrates how to use PlaywrightCrawler together with a FingerprintGenerator, which populates several browser attributes to mimic a real browser fingerprint. To read more about fingerprinting, see https://docs.apify.com/academy/anti-scraping/techniques/fingerprinting.
You can implement your own fingerprint generator (see the sketch after the example below) or use DefaultFingerprintGenerator. To use the generator, initialize it with the desired fingerprint options. The generator will try to create a fingerprint based on those options; unspecified options are selected automatically by the generator from a set of reasonable values. If an option is important for you, do not rely on the defaults and define it explicitly.
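For instance, a configuration that pins the options you care about explicitly might look like the following sketch. Only browsers and min_width appear in the runnable example below; the operating_systems, max_width, min_height, and max_height fields are assumptions about the options API.

from crawlee.fingerprint_suite import (
    DefaultFingerprintGenerator,
    HeaderGeneratorOptions,
    ScreenOptions,
)

# Pin every option that matters instead of relying on generator defaults.
fingerprint_generator = DefaultFingerprintGenerator(
    header_options=HeaderGeneratorOptions(
        browsers=['chromium'],
        operating_systems=['windows'],  # Assumed field name.
    ),
    screen_options=ScreenOptions(
        min_width=1280,
        max_width=1920,  # Assumed field name.
        min_height=720,  # Assumed field name.
        max_height=1080,  # Assumed field name.
    ),
)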
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
from crawlee.fingerprint_suite import (
    DefaultFingerprintGenerator,
    HeaderGeneratorOptions,
    ScreenOptions,
)


async def main() -> None:
    # Use the default fingerprint generator with the desired fingerprint options.
    # The generator will create a real-looking browser fingerprint based on them.
    # Unspecified fingerprint options will be selected automatically by the generator.
    fingerprint_generator = DefaultFingerprintGenerator(
        header_options=HeaderGeneratorOptions(browsers=['chromium']),
        screen_options=ScreenOptions(min_width=400),
    )

    crawler = PlaywrightCrawler(
        # Limit the crawl to the given number of requests. Remove or increase it to crawl all links.
        max_requests_per_crawl=10,
        # Headless mode; set to False to see the browser in action.
        headless=False,
        # One of the browser types supported by Playwright.
        browser_type='chromium',
        # Fingerprint generator to be used. By default, no fingerprint generation is done.
        fingerprint_generator=fingerprint_generator,
    )

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Find a link to the next page and enqueue it if it exists.
        await context.enqueue_links(selector='.morelink')

    # Run the crawler with the initial list of URLs.
    await crawler.run(['https://news.ycombinator.com/'])


if __name__ == '__main__':
    asyncio.run(main())
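Because the crawler accepts any FingerprintGenerator, you can also plug in your own implementation, as mentioned above. The sketch below is an illustration under stated assumptions: it assumes FingerprintGenerator can be imported from crawlee.fingerprint_suite as an abstract base class with a single generate() method (both the import and the method name are assumptions). It rotates between two pre-configured default generators that differ only in the minimum screen width.

import random

from crawlee.fingerprint_suite import (
    DefaultFingerprintGenerator,
    FingerprintGenerator,
    HeaderGeneratorOptions,
    ScreenOptions,
)


class RotatingFingerprintGenerator(FingerprintGenerator):
    """Delegates each call to a randomly chosen pre-configured generator."""

    def __init__(self) -> None:
        # Two default generators that differ only in the minimum screen width.
        self._generators = [
            DefaultFingerprintGenerator(
                header_options=HeaderGeneratorOptions(browsers=['chromium']),
                screen_options=ScreenOptions(min_width=min_width),
            )
            for min_width in (1280, 1920)
        ]

    def generate(self):  # Assumed abstract method name.
        # Each generated fingerprint comes from a randomly picked generator.
        return random.choice(self._generators).generate()

An instance of this class could then be passed to PlaywrightCrawler through the same fingerprint_generator argument used in the example above.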