Crawl website with relative links
When crawling a website, you may encounter various types of links that you wish to include in your crawl. To facilitate this, we provide the enqueue_links method on the crawler context, which will automatically find and add these links to the crawler's RequestQueue. This method simplifies the process of handling different types of links, including relative links, by automatically resolving them based on the page's context.
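For example, a relative href found on a page is resolved against that page's URL before it is enqueued. The snippet below is only a conceptual sketch of that resolution using the standard library (the URLs and paths are made up for illustration); enqueue_links performs this step for you.

from urllib.parse import urljoin

# Hypothetical page URL; relative hrefs are resolved against the page they were found on.
page_url = 'https://crawlee.dev/docs/examples/'

print(urljoin(page_url, '../introduction'))  # https://crawlee.dev/docs/introduction
print(urljoin(page_url, '/api/core'))        # https://crawlee.dev/api/core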
note
For these examples, we are using the BeautifulSoupCrawler. However, the same method is available for other crawlers as well. You can use it in exactly the same way.
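For instance, the same handler works essentially unchanged with the PlaywrightCrawler. This is a minimal sketch, assuming your installation includes the Playwright extra (pip install 'crawlee[playwright]' followed by playwright install):

import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    crawler = PlaywrightCrawler(max_requests_per_crawl=10)

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')
        # enqueue_links is available on this context as well and is called the same way.
        await context.enqueue_links()

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())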
The EnqueueStrategy enum provides four distinct strategies for crawling relative links:
- EnqueueStrategy.ALL - Enqueues all links found, regardless of the domain they point to. This strategy is useful when you want to follow every link, including those that navigate to external websites.
- EnqueueStrategy.SAME_DOMAIN - Enqueues all links found that share the same domain name, including any possible subdomains. This strategy ensures that all links within the same top-level and base domain are included.
- EnqueueStrategy.SAME_HOSTNAME - Enqueues all links found for the exact same hostname. This is the default strategy, and it restricts the crawl to links that have the same hostname as the current page, excluding subdomains.
- EnqueueStrategy.SAME_ORIGIN - Enqueues all links found that share the same origin. The same origin refers to URLs that share the same protocol, domain, and port, ensuring a strict scope for the crawl.
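To make the distinctions concrete, the sketch below compares a few made-up candidate URLs against a page on https://crawlee.dev by hostname and by origin using only the standard library. This is not Crawlee's actual implementation, just an illustration; note that SAME_DOMAIN additionally requires public-suffix handling to group subdomains, which is omitted here.

from urllib.parse import urlsplit

page = urlsplit('https://crawlee.dev/docs/examples')

candidates = [
    'https://crawlee.dev/docs/quick-start',  # same hostname and same origin
    'https://blog.crawlee.dev/article',      # same domain only (different subdomain)
    'http://crawlee.dev/docs',               # same hostname, different origin (scheme)
    'https://example.com/',                  # unrelated site; only ALL would enqueue it
]

for url in map(urlsplit, candidates):
    same_hostname = url.hostname == page.hostname
    same_origin = (url.scheme, url.hostname, url.port) == (page.scheme, page.hostname, page.port)
    print(f'{url.geturl():40} same_hostname={same_hostname} same_origin={same_origin}')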
The four examples below apply each strategy in turn.

All links
import asyncio

from crawlee import EnqueueStrategy
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
    )

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Enqueue all links found on the page. Any URLs found will be matched by this
        # strategy, even if they go off the site you are currently crawling.
        await context.enqueue_links(strategy=EnqueueStrategy.ALL)

    # Run the crawler with the initial list of requests.
    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
Same domain

import asyncio

from crawlee import EnqueueStrategy
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
    )

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Setting the strategy to SAME_DOMAIN will enqueue all links found that are on
        # the same domain (including subdomains) as request.loaded_url or request.url.
        await context.enqueue_links(strategy=EnqueueStrategy.SAME_DOMAIN)

    # Run the crawler with the initial list of requests.
    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
Same hostname

import asyncio

from crawlee import EnqueueStrategy
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
    )

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Setting the strategy to SAME_HOSTNAME will enqueue all links found that are on
        # the exact same hostname (excluding subdomains) as request.loaded_url or request.url.
        await context.enqueue_links(strategy=EnqueueStrategy.SAME_HOSTNAME)

    # Run the crawler with the initial list of requests.
    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
Same origin

import asyncio

from crawlee import EnqueueStrategy
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
    )

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Setting the strategy to SAME_ORIGIN will enqueue all links found that are on
        # the same origin as request.loaded_url or request.url.
        await context.enqueue_links(strategy=EnqueueStrategy.SAME_ORIGIN)

    # Run the crawler with the initial list of requests.
    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())