feat: allow using other HTTP clients (#2661)
- closes #2659

See https://gist.github.com/janbuchar/3a4724927de2c3a0bb16c46bb5940236
for an example curl-impersonate client.

The following got-scraping options were left out (they will still work when passed,
but they are not part of the new interface):

- decompress
- resolveBodyOnly
- allowGetBody
- dnsLookup
- dnsCache
- dnsLookupIpVersion
- retry
- hooks
- parseJson
- stringifyJson
- request
- cache
- cacheOptions
- http2
- https
- agent
- localAddress
- createConnection
- pagination
- setHost
- maxHeaderSize
- methodRewriting
- enableUnixSockets
- context

---------

Co-authored-by: Martin Adámek <[email protected]>
janbuchar and B4nan authored Oct 23, 2024
1 parent 59b715e commit 568c655
Showing 30 changed files with 1,115 additions and 98 deletions.
23 changes: 23 additions & 0 deletions docs/guides/custom-http-client/custom-http-client.mdx
@@ -0,0 +1,23 @@
---
id: custom-http-client
title: Using a custom HTTP client (Experimental)
description: Use a custom HTTP client for `sendRequest` and plain-HTTP crawling
---

import ApiLink from '@site/src/components/ApiLink';
import CodeBlock from '@theme/CodeBlock';

import ImplementationSource from '!!raw-loader!./implementation.ts';
import UsageSource from '!!raw-loader!./usage.ts';

The <ApiLink to="basic-crawler/class/BasicCrawler">`BasicCrawler`</ApiLink> class allows you to configure the HTTP client implementation via the `httpClient` constructor option. This can be useful for testing, or when you need to swap the default `got-scraping`-based implementation for something else, such as `curl-impersonate` or `axios`.

The HTTP client implementation needs to conform to the <ApiLink to="core/interface/BaseHttpClient">`BaseHttpClient`</ApiLink> interface. For a rough idea of how such an implementation might look, see this skeleton that uses the standard `fetch` API:

<CodeBlock language="ts">{ImplementationSource}</CodeBlock>

You can then instantiate it and pass it to a crawler constructor:

<CodeBlock language="ts">{UsageSource}</CodeBlock>

Please note that the interface is experimental and will likely change in Crawlee version 4.
122 changes: 122 additions & 0 deletions docs/guides/custom-http-client/implementation.ts
@@ -0,0 +1,122 @@
import {
    BaseHttpClient,
    HttpRequest,
    HttpResponse,
    RedirectHandler,
    ResponseTypes,
    StreamingHttpResponse,
} from '@crawlee/core';
import { Readable } from 'node:stream';

class CustomHttpClient implements BaseHttpClient {
    async sendRequest<TResponseType extends keyof ResponseTypes = 'text'>(
        request: HttpRequest<TResponseType>,
    ): Promise<HttpResponse<TResponseType>> {
        const requestHeaders = new Headers();
        for (let [headerName, headerValues] of Object.entries(request.headers ?? {})) {
            if (headerValues === undefined) {
                continue;
            }

            if (!Array.isArray(headerValues)) {
                headerValues = [headerValues];
            }

            for (const value of headerValues) {
                requestHeaders.append(headerName, value);
            }
        }

        const response = await fetch(request.url, {
            method: request.method,
            headers: requestHeaders,
            body: request.body as string, // TODO implement stream/generator handling
            signal: request.signal,
            // TODO implement the rest of request parameters (e.g., timeout, proxyUrl, cookieJar, ...)
        });

        const headers: Record<string, string> = {};

        response.headers.forEach((value, headerName) => {
            headers[headerName] = value;
        });

        return {
            complete: true,
            request,
            url: response.url,
            statusCode: response.status,
            redirectUrls: [], // TODO you need to handle redirects manually to track them
            headers,
            trailers: {}, // TODO not supported by fetch
            ip: undefined,
            body:
                request.responseType === 'text'
                    ? await response.text()
                    : request.responseType === 'json'
                      ? await response.json()
                      : Buffer.from(await response.text()),
        };
    }

    async stream(request: HttpRequest, onRedirect?: RedirectHandler): Promise<StreamingHttpResponse> {
        const fetchResponse = await fetch(request.url, {
            method: request.method,
            headers: new Headers(),
            body: request.body as string, // TODO implement stream/generator handling
            signal: request.signal,
            // TODO implement the rest of request parameters (e.g., timeout, proxyUrl, cookieJar, ...)
        });

        const headers: Record<string, string> = {}; // TODO same as in sendRequest()

        async function* read() {
            const reader = fetchResponse.body?.getReader();

            const stream = new ReadableStream({
                start(controller) {
                    if (!reader) {
                        return null;
                    }
                    return pump();
                    function pump() {
                        return reader!.read().then(({ done, value }) => {
                            // When no more data needs to be consumed, close the stream
                            if (done) {
                                controller.close();
                                return;
                            }
                            // Enqueue the next data chunk into our target stream
                            controller.enqueue(value);
                            return pump();
                        });
                    }
                },
            });

            for await (const chunk of stream) {
                yield chunk;
            }
        }

        const response = {
            complete: false,
            request,
            url: fetchResponse.url,
            statusCode: fetchResponse.status,
            redirectUrls: [], // TODO you need to handle redirects manually to track them
            headers,
            trailers: {}, // TODO not supported by fetch
            ip: undefined,
            stream: Readable.from(read()),
            get downloadProgress() {
                return { percent: 0, transferred: 0 }; // TODO track this
            },
            get uploadProgress() {
                return { percent: 0, transferred: 0 }; // TODO track this
            },
        };

        return response;
    }
}
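
The skeleton above leaves header collection in `stream()` as a TODO ("same as in sendRequest()"). A minimal sketch of a shared helper, assuming you want to reuse the same logic in both methods (this helper is not part of the commit):

// Hypothetical helper: flatten fetch Headers into the plain record shape
// that HttpResponse and StreamingHttpResponse expect.
function headersToRecord(fetchHeaders: Headers): Record<string, string> {
    const headers: Record<string, string> = {};
    fetchHeaders.forEach((value, headerName) => {
        headers[headerName] = value;
    });
    return headers;
}

Both `sendRequest()` and `stream()` could then call `headersToRecord(response.headers)` instead of building the record inline.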
6 changes: 6 additions & 0 deletions docs/guides/custom-http-client/usage.ts
@@ -0,0 +1,6 @@
const crawler = new HttpCrawler({
    httpClient: new CustomHttpClient(),
    async requestHandler() {
        /* ... */
    },
});
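
As committed, `usage.ts` has no imports (the docs page supplies the surrounding context). A self-contained variant might look roughly like this; the import paths and the assumption that `CustomHttpClient` is exported are illustrative, not part of the commit:

import { HttpCrawler } from '@crawlee/http';

// Assumes the skeleton class from implementation.ts is exported from a local module.
import { CustomHttpClient } from './implementation';

const crawler = new HttpCrawler({
    httpClient: new CustomHttpClient(),
    async requestHandler({ request, body, log }) {
        log.info(`Fetched ${request.url} (${body.length} bytes)`);
    },
});

await crawler.run(['https://crawlee.dev']);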
1 change: 1 addition & 0 deletions package.json
@@ -85,6 +85,7 @@
"@typescript-eslint/parser": "^7.18.0",
"@vitest/coverage-v8": "^2.0.0",
"apify": "*",
"apify-node-curl-impersonate": "^1.0.15",
"basic-auth-parser": "^0.0.2",
"body-parser": "^1.20.0",
"commitlint": "^19.0.0",
44 changes: 17 additions & 27 deletions packages/basic-crawler/src/internals/basic-crawler.ts
@@ -26,6 +26,7 @@ import type {
    StatisticState,
    StatisticsOptions,
    LoadedContext,
    BaseHttpClient,
    RestrictedCrawlingContext,
} from '@crawlee/core';
import {
@@ -50,17 +51,20 @@ import {
    SessionPool,
    Statistics,
    validators,
    GotScrapingHttpClient,
} from '@crawlee/core';
import type { Awaitable, BatchAddRequestsResult, Dictionary, SetStatusMessageOptions } from '@crawlee/types';
import { ROTATE_PROXY_ERRORS, gotScraping } from '@crawlee/utils';
import { ROTATE_PROXY_ERRORS } from '@crawlee/utils';
import { stringify } from 'csv-stringify/sync';
import { ensureDir, writeFile, writeJSON } from 'fs-extra';
// @ts-expect-error This throws a compilation error due to got-scraping being ESM only but we only import types, so its alllll gooooood
import type { OptionsInit, Method } from 'got-scraping';
import type { OptionsInit, Method, GotResponse } from 'got-scraping';
import ow, { ArgumentError } from 'ow';
import { getDomain } from 'tldts';
import type { SetRequired } from 'type-fest';

import { createSendRequest } from './send-request';

export interface BasicCrawlingContext<UserData extends Dictionary = Dictionary>
    extends CrawlingContext<BasicCrawler, UserData> {
    /**
@@ -351,6 +355,12 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
     * whether to output them to the Key-Value store.
     */
    statisticsOptions?: StatisticsOptions;

    /**
     * HTTP client implementation for the `sendRequest` context helper and for plain HTTP crawling.
     * Defaults to a new instance of {@apilink GotScrapingHttpClient}
     */
    httpClient?: BaseHttpClient;
}

/**
@@ -496,6 +506,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
    protected crawlingContexts = new Map<string, Context>();
    protected autoscaledPoolOptions: AutoscaledPoolOptions;
    protected events: EventManager;
    protected httpClient: BaseHttpClient;
    protected retryOnBlocked: boolean;
    private _closeEvents?: boolean;

@@ -530,6 +541,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
            statusMessageCallback: ow.optional.function,

            retryOnBlocked: ow.optional.boolean,
            httpClient: ow.optional.object,

            // AutoscaledPool shorthands
            minConcurrency: ow.optional.number,
@@ -592,10 +604,12 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
            statusMessageCallback,

            statisticsOptions,
            httpClient,
        } = options;

        this.requestList = requestList;
        this.requestQueue = requestQueue;
        this.httpClient = httpClient ?? new GotScrapingHttpClient();
        this.log = log;
        this.statusMessageLoggingInterval = statusMessageLoggingInterval;
        this.statusMessageCallback = statusMessageCallback as StatusMessageCallback;
@@ -1273,31 +1287,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
            addRequests: this.addRequests.bind(this),
            pushData: this.pushData.bind(this),
            useState: this.useState.bind(this),
            sendRequest: async (overrideOptions?: OptionsInit) => {
                const cookieJar = session
                    ? {
                          getCookieString: async (url: string) => session!.getCookieString(url),
                          setCookie: async (rawCookie: string, url: string) => session!.setCookie(rawCookie, url),
                          ...overrideOptions?.cookieJar,
                      }
                    : overrideOptions?.cookieJar;

                return gotScraping({
                    url: request!.url,
                    method: request!.method as Method, // Narrow type to omit CONNECT
                    body: request!.payload,
                    headers: request!.headers,
                    proxyUrl: crawlingContext.proxyInfo?.url,
                    sessionToken: session,
                    responseType: 'text',
                    ...overrideOptions,
                    retry: {
                        limit: 0,
                        ...overrideOptions?.retry,
                    },
                    cookieJar,
                });
            },
            sendRequest: createSendRequest(this.httpClient, request!, session, () => crawlingContext.proxyInfo?.url),
            getKeyValueStore: async (idOrName?: string) => KeyValueStore.open(idOrName, { config: this.config }),
        };

54 changes: 54 additions & 0 deletions packages/basic-crawler/src/internals/send-request.ts
@@ -0,0 +1,54 @@
import {
    type Session,
    type Request,
    type BaseHttpClient,
    type HttpRequestOptions,
    processHttpRequestOptions,
} from '@crawlee/core';
// @ts-expect-error This throws a compilation error due to got-scraping being ESM only but we only import types, so its alllll gooooood
import type { Method, GotResponse } from 'got-scraping';

/**
 * Prepares a function to be used as the `sendRequest` context helper.
 *
 * @internal
 * @param httpClient The HTTP client that will perform the requests.
 * @param originRequest The crawling request being processed.
 * @param session The user session associated with the current request.
 * @param getProxyUrl A function that will return the proxy URL that should be used for handling the request.
 */
export function createSendRequest(
    httpClient: BaseHttpClient,
    originRequest: Request,
    session: Session | undefined,
    getProxyUrl: () => string | undefined,
) {
    return async <Response = string>(
        // TODO the type information here (and in crawler_commons) is outright wrong... for BC - replace this with generic HttpResponse in v4
        overrideOptions: Partial<HttpRequestOptions> = {},
    ): Promise<GotResponse<Response>> => {
        const cookieJar = session
            ? {
                  getCookieString: async (url: string) => session.getCookieString(url),
                  setCookie: async (rawCookie: string, url: string) => session.setCookie(rawCookie, url),
                  ...overrideOptions?.cookieJar,
              }
            : overrideOptions?.cookieJar;

        const requestOptions = processHttpRequestOptions({
            url: originRequest.url,
            method: originRequest.method as Method, // Narrow type to omit CONNECT
            headers: originRequest.headers,
            proxyUrl: getProxyUrl(),
            sessionToken: session,
            responseType: 'text',
            ...overrideOptions,
            cookieJar,
        });

        // Fill in body as the last step - `processHttpRequestOptions` may use either `body`, `json` or `form` so we cannot override it beforehand
        requestOptions.body ??= originRequest.payload;

        return httpClient.sendRequest<any>(requestOptions);
    };
}
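
For context, the function returned by `createSendRequest` is exposed to user code as the `sendRequest` context helper. A typical call site looks roughly like this (the handler body is illustrative only):

import { BasicCrawler } from 'crawlee';

const crawler = new BasicCrawler({
    async requestHandler({ sendRequest, log }) {
        // Override options are merged over the current request's url, method and headers
        // and routed through the crawler's configured httpClient.
        const response = await sendRequest({ responseType: 'json' });
        log.info(`Received status ${response.statusCode}`);
    },
});

await crawler.run(['https://crawlee.dev']);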
2 changes: 1 addition & 1 deletion packages/browser-pool/src/fingerprinting/hooks.ts
@@ -1,9 +1,9 @@
import type { BrowserFingerprintWithHeaders } from 'fingerprint-generator';
import type { FingerprintInjector } from 'fingerprint-injector';

import type { BrowserPool } from '../browser-pool';
import { getGeneratorDefaultOptions } from './utils';
import type { BrowserController } from '../abstract-classes/browser-controller';
import type { BrowserPool } from '../browser-pool';
import type { LaunchContext } from '../launch-context';
import { PlaywrightPlugin } from '../playwright/playwright-plugin';
import { PuppeteerPlugin } from '../puppeteer/puppeteer-plugin';
15 changes: 8 additions & 7 deletions packages/core/src/cookie_utils.ts
@@ -1,19 +1,20 @@
import type { IncomingMessage } from 'node:http';

import type { BrowserLikeResponse, Dictionary, Cookie as CookieObject } from '@crawlee/types';
import type { Cookie as CookieObject } from '@crawlee/types';
import { Cookie, CookieJar } from 'tough-cookie';

import { log } from './log';
import { CookieParseError } from './session_pool/errors';

export interface ResponseLike {
    url?: string | (() => string);
    headers?: Record<string, string | string[] | undefined> | (() => Record<string, string | string[] | undefined>);
}

/**
 * @internal
 */
export function getCookiesFromResponse(
    response: IncomingMessage | BrowserLikeResponse | { headers: Dictionary<string | string[]> },
): Cookie[] {
export function getCookiesFromResponse(response: ResponseLike): Cookie[] {
    const headers = typeof response.headers === 'function' ? response.headers() : response.headers;
    const cookieHeader = headers['set-cookie'] || '';
    const cookieHeader = headers?.['set-cookie'] || '';

    try {
        return Array.isArray(cookieHeader)
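
The widened `ResponseLike` signature accepts either a plain header record or a header getter. A hedged illustration of both shapes (`getCookiesFromResponse` is marked `@internal`, and the `@crawlee/core` import here is an assumption):

import { getCookiesFromResponse } from '@crawlee/core';

// Plain header record, e.g. from got-scraping or the fetch-based client above.
const cookies = getCookiesFromResponse({
    url: 'https://example.com',
    headers: { 'set-cookie': ['session=abc123; Path=/; HttpOnly'] },
});

// Header getter, e.g. a browser-style response whose headers() returns a record.
const browserCookies = getCookiesFromResponse({
    headers: () => ({ 'set-cookie': 'theme=dark; Path=/' }),
});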
