feat: allow using other HTTP clients (#2661)
- closes #2659

See https://gist.github.com/janbuchar/3a4724927de2c3a0bb16c46bb5940236
for an example curl-impersonate client.

The following got-scraping options were left out (they will still work when passed,
but they are not part of the new interface):

- decompress
- resolveBodyOnly
- allowGetBody
- dnsLookup
- dnsCache
- dnsLookupIpVersion
- retry
- hooks
- parseJson
- stringifyJson
- request
- cache
- cacheOptions
- http2
- https
- agent
- localAddress
- createConnection
- pagination
- setHost
- maxHeaderSize
- methodRewriting
- enableUnixSockets
- context

---------

Co-authored-by: Martin Adámek <[email protected]>
janbuchar and B4nan authored Oct 23, 2024
1 parent 59b715e commit 568c655
Showing 30 changed files with 1,115 additions and 98 deletions.
23 changes: 23 additions & 0 deletions docs/guides/custom-http-client/custom-http-client.mdx
@@ -0,0 +1,23 @@
---
id: custom-http-client
title: Using a custom HTTP client (Experimental)
description: Use a custom HTTP client for `sendRequest` and plain-HTTP crawling
---

import ApiLink from '@site/src/components/ApiLink';
import CodeBlock from '@theme/CodeBlock';

import ImplementationSource from '!!raw-loader!./implementation.ts';
import UsageSource from '!!raw-loader!./usage.ts';

The <ApiLink to="basic-crawler/class/BasicCrawler">`BasicCrawler`</ApiLink> class allows you to configure the HTTP client implementation via the `httpClient` constructor option. This can be useful for testing, or when you need to swap the default `got-scraping`-based implementation for something else, such as `curl-impersonate` or `axios`.

The HTTP client implementation needs to conform to the <ApiLink to="core/interface/BaseHttpClient">`BaseHttpClient`</ApiLink> interface. For a rough idea of how such an implementation might look, see this skeleton that uses the standard `fetch` API:

<CodeBlock language="ts">{ImplementationSource}</CodeBlock>

You can then instantiate it and pass it to a crawler constructor:

<CodeBlock language="ts">{UsageSource}</CodeBlock>

Please note that the interface is experimental and will likely change in Crawlee version 4.
122 changes: 122 additions & 0 deletions docs/guides/custom-http-client/implementation.ts
@@ -0,0 +1,122 @@
import {
    BaseHttpClient,
    HttpRequest,
    HttpResponse,
    RedirectHandler,
    ResponseTypes,
    StreamingHttpResponse,
} from '@crawlee/core';
import { Readable } from 'node:stream';

class CustomHttpClient implements BaseHttpClient {
    async sendRequest<TResponseType extends keyof ResponseTypes = 'text'>(
        request: HttpRequest<TResponseType>,
    ): Promise<HttpResponse<TResponseType>> {
        const requestHeaders = new Headers();
        for (let [headerName, headerValues] of Object.entries(request.headers ?? {})) {
            if (headerValues === undefined) {
                continue;
            }

            if (!Array.isArray(headerValues)) {
                headerValues = [headerValues];
            }

            for (const value of headerValues) {
                requestHeaders.append(headerName, value);
            }
        }

        const response = await fetch(request.url, {
            method: request.method,
            headers: requestHeaders,
            body: request.body as string, // TODO implement stream/generator handling
            signal: request.signal,
            // TODO implement the rest of request parameters (e.g., timeout, proxyUrl, cookieJar, ...)
        });

        const headers: Record<string, string> = {};

        response.headers.forEach((value, headerName) => {
            headers[headerName] = value;
        });

        return {
            complete: true,
            request,
            url: response.url,
            statusCode: response.status,
            redirectUrls: [], // TODO you need to handle redirects manually to track them
            headers,
            trailers: {}, // TODO not supported by fetch
            ip: undefined,
            body:
                request.responseType === 'text'
                    ? await response.text()
                    : request.responseType === 'json'
                      ? await response.json()
                      : Buffer.from(await response.text()),
        };
    }

    async stream(request: HttpRequest, onRedirect?: RedirectHandler): Promise<StreamingHttpResponse> {
        const fetchResponse = await fetch(request.url, {
            method: request.method,
            headers: new Headers(),
            body: request.body as string, // TODO implement stream/generator handling
            signal: request.signal,
            // TODO implement the rest of request parameters (e.g., timeout, proxyUrl, cookieJar, ...)
        });

        const headers: Record<string, string> = {}; // TODO same as in sendRequest()

        async function* read() {
            const reader = fetchResponse.body?.getReader();

            const stream = new ReadableStream({
                start(controller) {
                    if (!reader) {
                        return null;
                    }
                    return pump();
                    function pump() {
                        return reader!.read().then(({ done, value }) => {
                            // When no more data needs to be consumed, close the stream
                            if (done) {
                                controller.close();
                                return;
                            }
                            // Enqueue the next data chunk into our target stream
                            controller.enqueue(value);
                            return pump();
                        });
                    }
                },
            });

            for await (const chunk of stream) {
                yield chunk;
            }
        }

        const response = {
            complete: false,
            request,
            url: fetchResponse.url,
            statusCode: fetchResponse.status,
            redirectUrls: [], // TODO you need to handle redirects manually to track them
            headers,
            trailers: {}, // TODO not supported by fetch
            ip: undefined,
            stream: Readable.from(read()),
            get downloadProgress() {
                return { percent: 0, transferred: 0 }; // TODO track this
            },
            get uploadProgress() {
                return { percent: 0, transferred: 0 }; // TODO track this
            },
        };

        return response;
    }
}
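
The skeleton above leaves header collection in `stream()` as a TODO ("same as in sendRequest()"). A minimal sketch of a shared helper, assuming you want to reuse the same logic in both methods (this helper is not part of the commit):

// Hypothetical helper: flatten fetch Headers into the plain record shape
// that HttpResponse and StreamingHttpResponse expect.
function headersToRecord(fetchHeaders: Headers): Record<string, string> {
    const headers: Record<string, string> = {};
    fetchHeaders.forEach((value, headerName) => {
        headers[headerName] = value;
    });
    return headers;
}

Both `sendRequest()` and `stream()` could then call `headersToRecord(response.headers)` instead of building the record inline.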
6 changes: 6 additions & 0 deletions docs/guides/custom-http-client/usage.ts
@@ -0,0 +1,6 @@
const crawler = new HttpCrawler({
    httpClient: new CustomHttpClient(),
    async requestHandler() {
        /* ... */
    },
});
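
As committed, `usage.ts` has no imports (the docs page supplies the surrounding context). A self-contained variant might look roughly like this; the import paths and the assumption that `CustomHttpClient` is exported are illustrative, not part of the commit:

import { HttpCrawler } from '@crawlee/http';

// Assumes the skeleton class from implementation.ts is exported from a local module.
import { CustomHttpClient } from './implementation';

const crawler = new HttpCrawler({
    httpClient: new CustomHttpClient(),
    async requestHandler({ request, body, log }) {
        log.info(`Fetched ${request.url} (${body.length} bytes)`);
    },
});

await crawler.run(['https://crawlee.dev']);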
1 change: 1 addition & 0 deletions package.json
@@ -85,6 +85,7 @@
"@typescript-eslint/parser": "^7.18.0",
"@vitest/coverage-v8": "^2.0.0",
"apify": "*",
"apify-node-curl-impersonate": "^1.0.15",
"basic-auth-parser": "^0.0.2",
"body-parser": "^1.20.0",
"commitlint": "^19.0.0",
44 changes: 17 additions & 27 deletions packages/basic-crawler/src/internals/basic-crawler.ts
@@ -26,6 +26,7 @@ import type {
    StatisticState,
    StatisticsOptions,
    LoadedContext,
    BaseHttpClient,
    RestrictedCrawlingContext,
} from '@crawlee/core';
import {
@@ -50,17 +51,20 @@ import {
    SessionPool,
    Statistics,
    validators,
    GotScrapingHttpClient,
} from '@crawlee/core';
import type { Awaitable, BatchAddRequestsResult, Dictionary, SetStatusMessageOptions } from '@crawlee/types';
import { ROTATE_PROXY_ERRORS, gotScraping } from '@crawlee/utils';
import { ROTATE_PROXY_ERRORS } from '@crawlee/utils';
import { stringify } from 'csv-stringify/sync';
import { ensureDir, writeFile, writeJSON } from 'fs-extra';
// @ts-expect-error This throws a compilation error due to got-scraping being ESM only but we only import types, so its alllll gooooood
import type { OptionsInit, Method } from 'got-scraping';
import type { OptionsInit, Method, GotResponse } from 'got-scraping';
import ow, { ArgumentError } from 'ow';
import { getDomain } from 'tldts';
import type { SetRequired } from 'type-fest';

import { createSendRequest } from './send-request';

export interface BasicCrawlingContext<UserData extends Dictionary = Dictionary>
    extends CrawlingContext<BasicCrawler, UserData> {
    /**
@@ -351,6 +355,12 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
     * whether to output them to the Key-Value store.
     */
    statisticsOptions?: StatisticsOptions;

    /**
     * HTTP client implementation for the `sendRequest` context helper and for plain HTTP crawling.
     * Defaults to a new instance of {@apilink GotScrapingHttpClient}
     */
    httpClient?: BaseHttpClient;
}

/**
@@ -496,6 +506,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
    protected crawlingContexts = new Map<string, Context>();
    protected autoscaledPoolOptions: AutoscaledPoolOptions;
    protected events: EventManager;
    protected httpClient: BaseHttpClient;
    protected retryOnBlocked: boolean;
    private _closeEvents?: boolean;

@@ -530,6 +541,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
            statusMessageCallback: ow.optional.function,

            retryOnBlocked: ow.optional.boolean,
            httpClient: ow.optional.object,

            // AutoscaledPool shorthands
            minConcurrency: ow.optional.number,
@@ -592,10 +604,12 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
            statusMessageCallback,

            statisticsOptions,
            httpClient,
        } = options;

        this.requestList = requestList;
        this.requestQueue = requestQueue;
        this.httpClient = httpClient ?? new GotScrapingHttpClient();
        this.log = log;
        this.statusMessageLoggingInterval = statusMessageLoggingInterval;
        this.statusMessageCallback = statusMessageCallback as StatusMessageCallback;
@@ -1273,31 +1287,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
            addRequests: this.addRequests.bind(this),
            pushData: this.pushData.bind(this),
            useState: this.useState.bind(this),
            sendRequest: async (overrideOptions?: OptionsInit) => {
                const cookieJar = session
                    ? {
                          getCookieString: async (url: string) => session!.getCookieString(url),
                          setCookie: async (rawCookie: string, url: string) => session!.setCookie(rawCookie, url),
                          ...overrideOptions?.cookieJar,
                      }
                    : overrideOptions?.cookieJar;

                return gotScraping({
                    url: request!.url,
                    method: request!.method as Method, // Narrow type to omit CONNECT
                    body: request!.payload,
                    headers: request!.headers,
                    proxyUrl: crawlingContext.proxyInfo?.url,
                    sessionToken: session,
                    responseType: 'text',
                    ...overrideOptions,
                    retry: {
                        limit: 0,
                        ...overrideOptions?.retry,
                    },
                    cookieJar,
                });
            },
            sendRequest: createSendRequest(this.httpClient, request!, session, () => crawlingContext.proxyInfo?.url),
            getKeyValueStore: async (idOrName?: string) => KeyValueStore.open(idOrName, { config: this.config }),
        };

54 changes: 54 additions & 0 deletions packages/basic-crawler/src/internals/send-request.ts
@@ -0,0 +1,54 @@
import {
    type Session,
    type Request,
    type BaseHttpClient,
    type HttpRequestOptions,
    processHttpRequestOptions,
} from '@crawlee/core';
// @ts-expect-error This throws a compilation error due to got-scraping being ESM only but we only import types, so its alllll gooooood
import type { Method, GotResponse } from 'got-scraping';

/**
 * Prepares a function to be used as the `sendRequest` context helper.
 *
 * @internal
 * @param httpClient The HTTP client that will perform the requests.
 * @param originRequest The crawling request being processed.
 * @param session The user session associated with the current request.
 * @param getProxyUrl A function that will return the proxy URL that should be used for handling the request.
 */
export function createSendRequest(
    httpClient: BaseHttpClient,
    originRequest: Request,
    session: Session | undefined,
    getProxyUrl: () => string | undefined,
) {
    return async <Response = string>(
        // TODO the type information here (and in crawler_commons) is outright wrong... for BC - replace this with generic HttpResponse in v4
        overrideOptions: Partial<HttpRequestOptions> = {},
    ): Promise<GotResponse<Response>> => {
        const cookieJar = session
            ? {
                  getCookieString: async (url: string) => session.getCookieString(url),
                  setCookie: async (rawCookie: string, url: string) => session.setCookie(rawCookie, url),
                  ...overrideOptions?.cookieJar,
              }
            : overrideOptions?.cookieJar;

        const requestOptions = processHttpRequestOptions({
            url: originRequest.url,
            method: originRequest.method as Method, // Narrow type to omit CONNECT
            headers: originRequest.headers,
            proxyUrl: getProxyUrl(),
            sessionToken: session,
            responseType: 'text',
            ...overrideOptions,
            cookieJar,
        });

        // Fill in body as the last step - `processHttpRequestOptions` may use either `body`, `json` or `form` so we cannot override it beforehand
        requestOptions.body ??= originRequest.payload;

        return httpClient.sendRequest<any>(requestOptions);
    };
}
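
For context, the function returned by `createSendRequest` is exposed to user code as the `sendRequest` context helper. A typical call site looks roughly like this (the handler body is illustrative only):

import { BasicCrawler } from 'crawlee';

const crawler = new BasicCrawler({
    async requestHandler({ sendRequest, log }) {
        // Override options are merged over the current request's url, method and headers
        // and routed through the crawler's configured httpClient.
        const response = await sendRequest({ responseType: 'json' });
        log.info(`Received status ${response.statusCode}`);
    },
});

await crawler.run(['https://crawlee.dev']);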
2 changes: 1 addition & 1 deletion packages/browser-pool/src/fingerprinting/hooks.ts
@@ -1,9 +1,9 @@
import type { BrowserFingerprintWithHeaders } from 'fingerprint-generator';
import type { FingerprintInjector } from 'fingerprint-injector';

import type { BrowserPool } from '../browser-pool';
import { getGeneratorDefaultOptions } from './utils';
import type { BrowserController } from '../abstract-classes/browser-controller';
import type { BrowserPool } from '../browser-pool';
import type { LaunchContext } from '../launch-context';
import { PlaywrightPlugin } from '../playwright/playwright-plugin';
import { PuppeteerPlugin } from '../puppeteer/puppeteer-plugin';
15 changes: 8 additions & 7 deletions packages/core/src/cookie_utils.ts
@@ -1,19 +1,20 @@
import type { IncomingMessage } from 'node:http';

import type { BrowserLikeResponse, Dictionary, Cookie as CookieObject } from '@crawlee/types';
import type { Cookie as CookieObject } from '@crawlee/types';
import { Cookie, CookieJar } from 'tough-cookie';

import { log } from './log';
import { CookieParseError } from './session_pool/errors';

export interface ResponseLike {
    url?: string | (() => string);
    headers?: Record<string, string | string[] | undefined> | (() => Record<string, string | string[] | undefined>);
}

/**
 * @internal
 */
export function getCookiesFromResponse(
    response: IncomingMessage | BrowserLikeResponse | { headers: Dictionary<string | string[]> },
): Cookie[] {
export function getCookiesFromResponse(response: ResponseLike): Cookie[] {
    const headers = typeof response.headers === 'function' ? response.headers() : response.headers;
    const cookieHeader = headers['set-cookie'] || '';
    const cookieHeader = headers?.['set-cookie'] || '';

    try {
        return Array.isArray(cookieHeader)
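
The widened `ResponseLike` signature accepts either a plain header record or a header getter. A hedged illustration of both shapes (`getCookiesFromResponse` is marked `@internal`, and the `@crawlee/core` import here is an assumption):

import { getCookiesFromResponse } from '@crawlee/core';

// Plain header record, e.g. from got-scraping or the fetch-based client above.
const cookies = getCookiesFromResponse({
    url: 'https://example.com',
    headers: { 'set-cookie': ['session=abc123; Path=/; HttpOnly'] },
});

// Header getter, e.g. a browser-style response whose headers() returns a record.
const browserCookies = getCookiesFromResponse({
    headers: () => ({ 'set-cookie': 'theme=dark; Path=/' }),
});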
