feat: allow using other HTTP clients (#2661)

Closes #2659. See https://gist.github.com/janbuchar/3a4724927de2c3a0bb16c46bb5940236 for an example curl-impersonate client.

The following got-scraping options were ignored (they will still work, but they're not part of the new interface):

- decompress
- resolveBodyOnly
- allowGetBody
- dnsLookup
- dnsCache
- dnsLookupIpVersion
- retry
- hooks
- parseJson
- stringifyJson
- request
- cache
- cacheOptions
- http2
- https
- agent
- localAddress
- createConnection
- pagination
- setHost
- maxHeaderSize
- methodRewriting
- enableUnixSockets
- context

Co-authored-by: Martin Adámek <[email protected]>
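For reference, this is the shape of the new pluggable client interface, reconstructed from the example implementation included further down in this commit. It is an illustrative restatement (hence the placeholder name), not the authoritative definition; the real `BaseHttpClient` in `@crawlee/core` may contain additional members.

import type {
    HttpRequest,
    HttpResponse,
    RedirectHandler,
    ResponseTypes,
    StreamingHttpResponse,
} from '@crawlee/core';

// Reconstructed sketch of the new interface - see the docs and examples below.
interface PluggableHttpClient {
    // Buffered request/response, backing e.g. the `sendRequest` context helper.
    sendRequest<TResponseType extends keyof ResponseTypes = 'text'>(
        request: HttpRequest<TResponseType>,
    ): Promise<HttpResponse<TResponseType>>;

    // Streaming variant, used for plain-HTTP crawling and downloads.
    stream(request: HttpRequest, onRedirect?: RedirectHandler): Promise<StreamingHttpResponse>;
}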
Showing 30 changed files with 1,115 additions and 98 deletions.
@@ -0,0 +1,23 @@

---
id: custom-http-client
title: Using a custom HTTP client (Experimental)
description: Use a custom HTTP client for `sendRequest` and plain-HTTP crawling
---

import ApiLink from '@site/src/components/ApiLink';
import CodeBlock from '@theme/CodeBlock';

import ImplementationSource from '!!raw-loader!./implementation.ts';
import UsageSource from '!!raw-loader!./usage.ts';

The <ApiLink to="basic-crawler/class/BasicCrawler">`BasicCrawler`</ApiLink> class allows you to configure the HTTP client implementation using the `httpClient` constructor option. This might be useful for testing, or if you need to swap out the default implementation based on `got-scraping` for something else, such as `curl-impersonate` or `axios`.

The HTTP client implementation needs to conform to the <ApiLink to="core/interface/BaseHttpClient">`BaseHttpClient`</ApiLink> interface. For a rough idea of how it might look, see this skeleton implementation that uses the standard `fetch` interface:

<CodeBlock language="ts">{ImplementationSource}</CodeBlock>

You may then instantiate it and pass it to the crawler constructor:

<CodeBlock language="ts">{UsageSource}</CodeBlock>

Please note that the interface is experimental and will likely change in Crawlee version 4.
@@ -0,0 +1,122 @@
import {
    BaseHttpClient,
    HttpRequest,
    HttpResponse,
    RedirectHandler,
    ResponseTypes,
    StreamingHttpResponse,
} from '@crawlee/core';
import { Readable } from 'node:stream';

// Exported so that usage.ts can import it.
export class CustomHttpClient implements BaseHttpClient {
    async sendRequest<TResponseType extends keyof ResponseTypes = 'text'>(
        request: HttpRequest<TResponseType>,
    ): Promise<HttpResponse<TResponseType>> {
        // Crawlee headers may map a name to a single value, an array of values, or undefined.
        const requestHeaders = new Headers();
        for (let [headerName, headerValues] of Object.entries(request.headers ?? {})) {
            if (headerValues === undefined) {
                continue;
            }

            if (!Array.isArray(headerValues)) {
                headerValues = [headerValues];
            }

            for (const value of headerValues) {
                requestHeaders.append(headerName, value);
            }
        }

        const response = await fetch(request.url, {
            method: request.method,
            headers: requestHeaders,
            body: request.body as string, // TODO implement stream/generator handling
            signal: request.signal,
            // TODO implement the rest of request parameters (e.g., timeout, proxyUrl, cookieJar, ...)
        });

        const headers: Record<string, string> = {};

        response.headers.forEach((value, headerName) => {
            headers[headerName] = value;
        });

        return {
            complete: true,
            request,
            url: response.url,
            statusCode: response.status,
            redirectUrls: [], // TODO you need to handle redirects manually to track them
            headers,
            trailers: {}, // TODO not supported by fetch
            ip: undefined,
            body:
                request.responseType === 'text'
                    ? await response.text()
                    : request.responseType === 'json'
                      ? await response.json()
                      : Buffer.from(await response.arrayBuffer()), // arrayBuffer() keeps binary bodies intact
        };
    }

    async stream(request: HttpRequest, onRedirect?: RedirectHandler): Promise<StreamingHttpResponse> {
        const fetchResponse = await fetch(request.url, {
            method: request.method,
            headers: new Headers(),
            body: request.body as string, // TODO implement stream/generator handling
            signal: request.signal,
            // TODO implement the rest of request parameters (e.g., timeout, proxyUrl, cookieJar, ...)
        });

        const headers: Record<string, string> = {}; // TODO same as in sendRequest()

        async function* read() {
            const reader = fetchResponse.body?.getReader();

            const stream = new ReadableStream({
                start(controller) {
                    if (!reader) {
                        // No body to read - close right away so consumers don't hang.
                        controller.close();
                        return;
                    }
                    return pump();
                    function pump() {
                        return reader!.read().then(({ done, value }) => {
                            // When no more data needs to be consumed, close the stream
                            if (done) {
                                controller.close();
                                return;
                            }
                            // Enqueue the next data chunk into our target stream
                            controller.enqueue(value);
                            return pump();
                        });
                    }
                },
            });

            for await (const chunk of stream) {
                yield chunk;
            }
        }

        const response = {
            complete: false,
            request,
            url: fetchResponse.url,
            statusCode: fetchResponse.status,
            redirectUrls: [], // TODO you need to handle redirects manually to track them
            headers,
            trailers: {}, // TODO not supported by fetch
            ip: undefined,
            stream: Readable.from(read()),
            get downloadProgress() {
                return { percent: 0, transferred: 0 }; // TODO track this
            },
            get uploadProgress() {
                return { percent: 0, transferred: 0 }; // TODO track this
            },
        };

        return response;
    }
}
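The `redirectUrls` TODOs above are the trickiest gap, since `fetch` follows redirects transparently and never reports the intermediate hops. Below is a minimal sketch of one way to track them by hand, using `redirect: 'manual'` and following `Location` headers in a loop. The helper name and the 10-hop cap are illustrative assumptions, and the `onRedirect` callback is a simplified stand-in for Crawlee's `RedirectHandler`.

// Hypothetical helper that tracks redirects manually, since fetch() follows
// them on its own without exposing the visited URLs.
async function fetchWithRedirectTracking(
    url: string,
    init: RequestInit = {},
    onRedirect?: (response: Response, updatedRequest: { url: string }) => void,
): Promise<{ response: Response; redirectUrls: URL[] }> {
    const redirectUrls: URL[] = [];
    let currentUrl = url;

    for (let hop = 0; hop < 10; hop++) {
        // `redirect: 'manual'` makes fetch return 3xx responses instead of following them.
        const response = await fetch(currentUrl, { ...init, redirect: 'manual' });
        const location = response.headers.get('location');

        if (response.status < 300 || response.status >= 400 || location === null) {
            return { response, redirectUrls };
        }

        // Resolve relative Location headers against the current URL.
        currentUrl = new URL(location, currentUrl).href;
        redirectUrls.push(new URL(currentUrl));
        // NOTE: a complete implementation would also switch the method to GET
        // for 303 responses and drop the request body where required.
        onRedirect?.(response, { url: currentUrl });
    }

    throw new Error('Too many redirects');
}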
@@ -0,0 +1,6 @@
import { HttpCrawler } from '@crawlee/http';

import { CustomHttpClient } from './implementation';

const crawler = new HttpCrawler({
    httpClient: new CustomHttpClient(),
    async requestHandler() {
        /* ... */
    },
});
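To actually start crawling, you would then run the crawler with a list of start URLs; a minimal sketch (the URL is just a placeholder):

// Start the crawler; every fetched page goes through CustomHttpClient.
await crawler.run(['https://crawlee.dev']);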
@@ -0,0 +1,54 @@
import {
    type Session,
    type Request,
    type BaseHttpClient,
    type HttpRequestOptions,
    processHttpRequestOptions,
} from '@crawlee/core';
// @ts-expect-error This throws a compilation error due to got-scraping being ESM-only, but we only import types, so it's all good.
import type { Method, GotResponse } from 'got-scraping';

/**
 * Prepares a function to be used as the `sendRequest` context helper.
 *
 * @internal
 * @param httpClient The HTTP client that will perform the requests.
 * @param originRequest The crawling request being processed.
 * @param session The user session associated with the current request.
 * @param getProxyUrl A function that will return the proxy URL that should be used for handling the request.
 */
export function createSendRequest(
    httpClient: BaseHttpClient,
    originRequest: Request,
    session: Session | undefined,
    getProxyUrl: () => string | undefined,
) {
    return async <Response = string>(
        // TODO the type information here (and in crawler_commons) is outright wrong... for BC - replace this with a generic HttpResponse in v4
        overrideOptions: Partial<HttpRequestOptions> = {},
    ): Promise<GotResponse<Response>> => {
        // Bridge the Crawlee session to a got-style cookie jar, unless the caller supplied their own.
        const cookieJar = session
            ? {
                  getCookieString: async (url: string) => session.getCookieString(url),
                  setCookie: async (rawCookie: string, url: string) => session.setCookie(rawCookie, url),
                  ...overrideOptions?.cookieJar,
              }
            : overrideOptions?.cookieJar;

        const requestOptions = processHttpRequestOptions({
            url: originRequest.url,
            method: originRequest.method as Method, // Narrow the type to omit CONNECT
            headers: originRequest.headers,
            proxyUrl: getProxyUrl(),
            sessionToken: session,
            responseType: 'text',
            ...overrideOptions,
            cookieJar,
        });

        // Fill in the body as the last step - `processHttpRequestOptions` may use either `body`, `json` or `form`, so we cannot override it beforehand
        requestOptions.body ??= originRequest.payload;

        return httpClient.sendRequest<any>(requestOptions);
    };
}
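For context, this is roughly how the helper produced above is consumed: crawler code receives it as the `sendRequest` context helper, and any override options are merged over the URL, method, and headers of the request being crawled. A sketch, assuming a standard `HttpCrawler` setup:

import { HttpCrawler } from '@crawlee/http';

const crawler = new HttpCrawler({
    async requestHandler({ sendRequest, log }) {
        // Override only the response type; URL, method, and headers
        // come from the request currently being crawled.
        const response = await sendRequest({ responseType: 'json' });
        log.info(`Status: ${response.statusCode}`);
    },
});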