From 52b98e3e997680e352da5763b394750b19110953 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Wed, 3 Jan 2024 13:56:11 +0100 Subject: [PATCH] chore: Move extractUrlsFromCheerio to utils (#2265) --- packages/cheerio-crawler/package.json | 1 + .../src/internals/cheerio-crawler.ts | 32 +-------------- packages/core/src/enqueue_links/shared.ts | 13 +------ packages/utils/src/internals/cheerio.ts | 39 ++++++++++++++++++- packages/utils/src/internals/extract-urls.ts | 11 ++++++ yarn.lock | 1 + 6 files changed, 54 insertions(+), 43 deletions(-) diff --git a/packages/cheerio-crawler/package.json b/packages/cheerio-crawler/package.json index 1e1420ac63ea..44c784c246df 100644 --- a/packages/cheerio-crawler/package.json +++ b/packages/cheerio-crawler/package.json @@ -55,6 +55,7 @@ "dependencies": { "@crawlee/http": "3.7.1", "@crawlee/types": "3.7.1", + "@crawlee/utils": "3.7.1", "cheerio": "^1.0.0-rc.12", "htmlparser2": "^9.0.0", "tslib": "^2.4.0" diff --git a/packages/cheerio-crawler/src/internals/cheerio-crawler.ts b/packages/cheerio-crawler/src/internals/cheerio-crawler.ts index b45625f600a8..205aa904d350 100644 --- a/packages/cheerio-crawler/src/internals/cheerio-crawler.ts +++ b/packages/cheerio-crawler/src/internals/cheerio-crawler.ts @@ -17,9 +17,9 @@ import { enqueueLinks, Router, resolveBaseUrlForEnqueueLinksFiltering, - tryAbsoluteURL, } from '@crawlee/http'; import type { Dictionary } from '@crawlee/types'; +import { extractUrlsFromCheerio } from '@crawlee/utils'; import type { CheerioOptions } from 'cheerio'; import * as cheerio from 'cheerio'; import { DomHandler } from 'htmlparser2'; @@ -237,36 +237,6 @@ export async function cheerioCrawlerEnqueueLinks({ options, $, requestQueue, ori }); } -/** - * Extracts URLs from a given Cheerio object. - * @ignore - */ -function extractUrlsFromCheerio($: cheerio.CheerioAPI, selector: string, baseUrl: string): string[] { - const base = $('base').attr('href'); - const absoluteBaseUrl = base && tryAbsoluteURL(base, baseUrl); - - if (absoluteBaseUrl) { - baseUrl = absoluteBaseUrl; - } - - return $(selector) - .map((_i, el) => $(el).attr('href')) - .get() - .filter((href) => !!href) - .map((href) => { - // Throw a meaningful error when only a relative URL would be extracted instead of waiting for the Request to fail later. - const isHrefAbsolute = /^[a-z][a-z0-9+.-]*:/.test(href); // Grabbed this in 'is-absolute-url' package. - if (!isHrefAbsolute && !baseUrl) { - throw new Error(`An extracted URL: ${href} is relative and options.baseUrl is not set. ` - + 'Use options.baseUrl in enqueueLinks() to automatically resolve relative URLs.'); - } - return baseUrl - ? tryAbsoluteURL(href, baseUrl) - : href; - }) - .filter((href) => !!href) as string[]; -} - /** * Creates new {@apilink Router} instance that works based on request labels. * This instance can then serve as a `requestHandler` of your {@apilink CheerioCrawler}. diff --git a/packages/core/src/enqueue_links/shared.ts b/packages/core/src/enqueue_links/shared.ts index d7c1694ab458..a9197e8c0832 100644 --- a/packages/core/src/enqueue_links/shared.ts +++ b/packages/core/src/enqueue_links/shared.ts @@ -7,6 +7,8 @@ import type { EnqueueLinksOptions } from './enqueue_links'; import type { RequestOptions } from '../request'; import { Request } from '../request'; +export { tryAbsoluteURL } from '@crawlee/utils'; + const MAX_ENQUEUE_LINKS_CACHE_SIZE = 1000; /** @@ -245,17 +247,6 @@ export function createRequestOptions( }); } -/** - * Helper function used to validate URLs used when extracting URLs from a page - */ -export function tryAbsoluteURL(href: string, baseUrl: string): string | undefined { - try { - return (new URL(href, baseUrl)).href; - } catch { - return undefined; - } -} - /** * Takes an Apify {@apilink RequestOptions} object and changes its attributes in a desired way. This user-function is used * {@apilink enqueueLinks} to modify requests before enqueuing them. diff --git a/packages/utils/src/internals/cheerio.ts b/packages/utils/src/internals/cheerio.ts index 53354a9bd093..f306db4bec80 100644 --- a/packages/utils/src/internals/cheerio.ts +++ b/packages/utils/src/internals/cheerio.ts @@ -1,7 +1,9 @@ import type { Dictionary } from '@crawlee/types'; -import type { load } from 'cheerio'; +import type { load, CheerioAPI } from 'cheerio'; import cheerio from 'cheerio'; +import { tryAbsoluteURL } from './extract-urls'; + export type CheerioRoot = ReturnType; // NOTE: We are skipping 'noscript' since it's content is evaluated as text, instead of HTML elements. That damages the results. @@ -77,3 +79,38 @@ export function htmlToText(htmlOrCheerioElement: string | CheerioRoot): string { return text.trim(); } + +/** + * Extracts URLs from a given Cheerio object. + * + * @param $ the Cheerio object to extract URLs from + * @param selector a CSS selector for matching link elements + * @param baseUrl a URL for resolving relative links + * @throws when a relative URL is encountered with no baseUrl set + * @return An array of absolute URLs + */ +export function extractUrlsFromCheerio($: CheerioAPI, selector: string = 'a', baseUrl: string = ''): string[] { + const base = $('base').attr('href'); + const absoluteBaseUrl = base && tryAbsoluteURL(base, baseUrl); + + if (absoluteBaseUrl) { + baseUrl = absoluteBaseUrl; + } + + return $(selector) + .map((_i, el) => $(el).attr('href')) + .get() + .filter(Boolean) + .map((href) => { + // Throw a meaningful error when only a relative URL would be extracted instead of waiting for the Request to fail later. + const isHrefAbsolute = /^[a-z][a-z0-9+.-]*:/.test(href); // Grabbed this in 'is-absolute-url' package. + if (!isHrefAbsolute && !baseUrl) { + throw new Error(`An extracted URL: ${href} is relative and baseUrl is not set. ` + + 'Provide a baseUrl to automatically resolve relative URLs.'); + } + return baseUrl + ? tryAbsoluteURL(href, baseUrl) + : href; + }) + .filter(Boolean) as string[]; +} diff --git a/packages/utils/src/internals/extract-urls.ts b/packages/utils/src/internals/extract-urls.ts index 385c7eb0a99b..8d1d914333d0 100644 --- a/packages/utils/src/internals/extract-urls.ts +++ b/packages/utils/src/internals/extract-urls.ts @@ -83,3 +83,14 @@ export function extractUrls(options: ExtractUrlsOptions): string[] { return result; } + +/** + * Helper function used to validate URLs used when extracting URLs from a page + */ +export function tryAbsoluteURL(href: string, baseUrl: string): string | undefined { + try { + return (new URL(href, baseUrl)).href; + } catch { + return undefined; + } +} diff --git a/yarn.lock b/yarn.lock index add7037ce174..7cf20981a9a9 100644 --- a/yarn.lock +++ b/yarn.lock @@ -471,6 +471,7 @@ __metadata: dependencies: "@crawlee/http": "npm:3.7.1" "@crawlee/types": "npm:3.7.1" + "@crawlee/utils": "npm:3.7.1" cheerio: "npm:^1.0.0-rc.12" htmlparser2: "npm:^9.0.0" tslib: "npm:^2.4.0"