Using a custom HTTP client (Experimental)
The BasicCrawler class allows you to configure the HTTP client implementation using the httpClient constructor option. This might be useful for testing, or if you need to swap out the default implementation, which is based on got-scraping, for something else such as curl-impersonate or axios.
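For comparison, the default got-scraping-based client is also exposed as a class, so passing it explicitly is equivalent to omitting the option entirely. A minimal sketch; GotScrapingHttpClient is assumed to be the current export name in @crawlee/core and may change while the API is experimental:

import { BasicCrawler } from '@crawlee/basic';
import { GotScrapingHttpClient } from '@crawlee/core';

// Equivalent to leaving httpClient out - the got-scraping client is the default.
// (The export name is an assumption; check your Crawlee version.)
const crawler = new BasicCrawler({
    httpClient: new GotScrapingHttpClient(),
    async requestHandler() {
        /* ... */
    },
});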
The HTTP client implementation needs to conform to the BaseHttpClient interface. For a rough idea of what it might look like, see the following skeleton implementation that uses the standard fetch interface:
import {
    BaseHttpClient,
    HttpRequest,
    HttpResponse,
    RedirectHandler,
    ResponseTypes,
    StreamingHttpResponse,
} from '@crawlee/core';
import { Readable } from 'node:stream';

class CustomHttpClient implements BaseHttpClient {
    async sendRequest<TResponseType extends keyof ResponseTypes = 'text'>(
        request: HttpRequest<TResponseType>,
    ): Promise<HttpResponse<TResponseType>> {
        // Convert Crawlee's header map (values may be strings or string arrays)
        // into the Headers object expected by fetch.
        const requestHeaders = new Headers();
        for (let [headerName, headerValues] of Object.entries(request.headers ?? {})) {
            if (headerValues === undefined) {
                continue;
            }

            if (!Array.isArray(headerValues)) {
                headerValues = [headerValues];
            }

            for (const value of headerValues) {
                requestHeaders.append(headerName, value);
            }
        }

        const response = await fetch(request.url, {
            method: request.method,
            headers: requestHeaders,
            body: request.body as string, // TODO implement stream/generator handling
            signal: request.signal,
            // TODO implement the rest of the request parameters (e.g., timeout, proxyUrl, cookieJar, ...)
        });

        const headers: Record<string, string> = {};
        response.headers.forEach((value, headerName) => {
            headers[headerName] = value;
        });

        return {
            complete: true,
            request,
            url: response.url,
            statusCode: response.status,
            redirectUrls: [], // TODO you need to handle redirects manually to track them
            headers,
            trailers: {}, // TODO not supported by fetch
            ip: undefined,
            body:
                request.responseType === 'text'
                    ? await response.text()
                    : request.responseType === 'json'
                      ? await response.json()
                      : Buffer.from(await response.arrayBuffer()), // use arrayBuffer() so binary bodies are not corrupted
        };
    }

    async stream(request: HttpRequest, onRedirect?: RedirectHandler): Promise<StreamingHttpResponse> {
        const fetchResponse = await fetch(request.url, {
            method: request.method,
            headers: new Headers(), // TODO convert request.headers as in sendRequest()
            body: request.body as string, // TODO implement stream/generator handling
            signal: request.signal,
            // TODO implement the rest of the request parameters (e.g., timeout, proxyUrl, cookieJar, ...)
        });

        const headers: Record<string, string> = {}; // TODO populate from fetchResponse.headers as in sendRequest()

        async function* read() {
            const reader = fetchResponse.body?.getReader();

            const stream = new ReadableStream({
                start(controller) {
                    // Close right away if the response has no body - otherwise
                    // the iteration below would never complete.
                    if (!reader) {
                        controller.close();
                        return;
                    }

                    return pump();

                    function pump() {
                        return reader!.read().then(({ done, value }) => {
                            // When no more data needs to be consumed, close the stream
                            if (done) {
                                controller.close();
                                return;
                            }

                            // Enqueue the next data chunk into our target stream
                            controller.enqueue(value);
                            return pump();
                        });
                    }
                },
            });

            for await (const chunk of stream) {
                yield chunk;
            }
        }

        const response = {
            complete: false,
            request,
            url: fetchResponse.url,
            statusCode: fetchResponse.status,
            redirectUrls: [], // TODO you need to handle redirects manually to track them
            headers,
            trailers: {}, // TODO not supported by fetch
            ip: undefined,
            stream: Readable.from(read()),
            get downloadProgress() {
                return { percent: 0, transferred: 0 }; // TODO track this
            },
            get uploadProgress() {
                return { percent: 0, transferred: 0 }; // TODO track this
            },
        };

        return response;
    }
}
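Before wiring the client into a crawler, you can give it a quick smoke test by calling sendRequest() directly. A minimal sketch; the URL is a placeholder, and depending on your Crawlee version the HttpRequest type may require additional fields:

const client = new CustomHttpClient();

// Perform a plain GET; responseType 'json' makes the skeleton above parse
// the body with response.json().
const response = await client.sendRequest({
    url: 'https://api.example.com/items', // placeholder URL
    method: 'GET',
    responseType: 'json',
});
console.log(response.statusCode, response.body);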
You can then instantiate it and pass it to a crawler constructor:
import { HttpCrawler } from '@crawlee/http';

const crawler = new HttpCrawler({
    httpClient: new CustomHttpClient(),
    async requestHandler() {
        /* ... */
    },
});
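Because the httpClient option is declared on BasicCrawler, any crawler class that extends it, such as the HttpCrawler above, accepts a custom client in the same way.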
Please note that the interface is experimental and will likely change in Crawlee version 4.