RobotsFile
Methods
getSitemaps
Get URLs of sitemaps referenced in the robots file.
Returns string[]
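A minimal sketch of listing the referenced sitemaps, assuming RobotsFile is exported from the crawlee package and using crawlee.dev as a placeholder target:

```ts
import { RobotsFile } from 'crawlee';

// Fetch and parse robots.txt for the site, then list the sitemap URLs it references.
const robots = await RobotsFile.find('https://crawlee.dev');
const sitemapUrls: string[] = robots.getSitemaps();
console.log(sitemapUrls);
```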
isAllowed
Check whether a URL is allowed to be crawled according to the rules in robots.txt.
Parameters
url: string
the URL to check against the rules in robots.txt
optional userAgent: string = '*'
relevant user agent; defaults to '*'
Returns boolean
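A sketch of checking a single URL before enqueueing it (the URLs and user agent string are placeholders; the import path is assumed to be the crawlee package):

```ts
import { RobotsFile } from 'crawlee';

const robots = await RobotsFile.find('https://crawlee.dev');

// Ask whether the given user agent may crawl this URL according to robots.txt.
if (robots.isAllowed('https://crawlee.dev/docs/introduction', 'MyCrawler')) {
    // safe to enqueue the URL
}
```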
parseSitemaps
Parse all the sitemaps referenced in the robots file.
Returns Promise<Sitemap>
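A sketch of downloading and parsing the referenced sitemaps (placeholder URL, assumed crawlee import):

```ts
import { RobotsFile } from 'crawlee';

const robots = await RobotsFile.find('https://crawlee.dev');

// Download and parse every sitemap listed in robots.txt into a single Sitemap object.
const sitemap = await robots.parseSitemaps();
console.log(sitemap.urls.length);
```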
parseUrlsFromSitemaps
Get all URLs from all the sitemaps referenced in the robots file. A shorthand for (await robots.parseSitemaps()).urls.
Returns Promise<string[]>
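A sketch of collecting the sitemap URLs directly (placeholder URL, assumed crawlee import):

```ts
import { RobotsFile } from 'crawlee';

const robots = await RobotsFile.find('https://crawlee.dev');

// Equivalent to (await robots.parseSitemaps()).urls.
const urls: string[] = await robots.parseUrlsFromSitemaps();
console.log(urls.length);
```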
static find
Determine the location of a robots.txt file for a URL and fetch it.
Parameters
url: string
the URL to fetch robots.txt for
optional proxyUrl: string
a proxy to be used for fetching the robots.txt file
Returns Promise<RobotsFile>
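A sketch of fetching robots.txt for a page, optionally through a proxy (both URLs are placeholders; the import path is assumed):

```ts
import { RobotsFile } from 'crawlee';

// Resolves the robots.txt location for the given page (here https://crawlee.dev/robots.txt)
// and downloads it, optionally through the provided proxy.
const robots = await RobotsFile.find(
    'https://crawlee.dev/docs/introduction',
    'http://user:pass@proxy.example.com:8000', // hypothetical proxy URL
);
```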
static from
Allows providing the URL and robots.txt content explicitly instead of loading it from the target site.
Parameters
url: string
the URL of the robots.txt file
content: string
contents of robots.txt
optional proxyUrl: string
a proxy to be used for fetching the robots.txt file
Returns RobotsFile
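A sketch of constructing the object from content fetched elsewhere, so no request is made to the target site (the URL and rules are placeholders; the import path is assumed):

```ts
import { RobotsFile } from 'crawlee';

const content = [
    'User-agent: *',
    'Disallow: /admin/',
    'Sitemap: https://example.com/sitemap.xml',
].join('\n');

// Build a RobotsFile from already-downloaded robots.txt content.
const robots = RobotsFile.from('https://example.com/robots.txt', content);
console.log(robots.isAllowed('https://example.com/admin/login')); // expected: false
```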
Loads and queries information from a robots.txt file.
Example usage:
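(A sketch, assuming RobotsFile is imported from the crawlee package; the URLs are placeholders.)

```ts
import { RobotsFile } from 'crawlee';

const robots = await RobotsFile.find('https://crawlee.dev/docs/introduction');

// Keep only the URLs that robots.txt allows to be crawled.
const candidates = [
    'https://crawlee.dev/docs/introduction',
    'https://crawlee.dev/api/core',
];
const allowed = candidates.filter((url) => robots.isAllowed(url));
console.log(allowed);
```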