Skip to content

Commit

Permalink
chore: Move extractUrlsFromCheerio to utils (#2265)
Browse files Browse the repository at this point in the history
  • Loading branch information
janbuchar authored Jan 3, 2024
1 parent f2a8746 commit 52b98e3
Show file tree
Hide file tree
Showing 6 changed files with 54 additions and 43 deletions.
1 change: 1 addition & 0 deletions packages/cheerio-crawler/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
"dependencies": {
"@crawlee/http": "3.7.1",
"@crawlee/types": "3.7.1",
"@crawlee/utils": "3.7.1",
"cheerio": "^1.0.0-rc.12",
"htmlparser2": "^9.0.0",
"tslib": "^2.4.0"
Expand Down
32 changes: 1 addition & 31 deletions packages/cheerio-crawler/src/internals/cheerio-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@ import {
enqueueLinks,
Router,
resolveBaseUrlForEnqueueLinksFiltering,
tryAbsoluteURL,
} from '@crawlee/http';
import type { Dictionary } from '@crawlee/types';
import { extractUrlsFromCheerio } from '@crawlee/utils';
import type { CheerioOptions } from 'cheerio';
import * as cheerio from 'cheerio';
import { DomHandler } from 'htmlparser2';
Expand Down Expand Up @@ -237,36 +237,6 @@ export async function cheerioCrawlerEnqueueLinks({ options, $, requestQueue, ori
});
}

/**
 * Extracts URLs from a given Cheerio object.
 *
 * Reads the `href` attribute of every element matched by `selector` and resolves it
 * against `baseUrl`. When the document contains a `<base>` element whose `href` can be
 * resolved into a valid URL, that URL is used as the base instead of `baseUrl`.
 * Links whose `href` is empty, or cannot be resolved into a valid URL, are left out of the result.
 *
 * @param $ the Cheerio object to extract URLs from
 * @param selector a CSS selector for matching link elements
 * @param baseUrl a URL for resolving relative links
 * @throws when a relative URL is encountered and no base URL is available
 * @returns the extracted URLs
 * @ignore
 */
function extractUrlsFromCheerio($: cheerio.CheerioAPI, selector: string, baseUrl: string): string[] {
    const base = $('base').attr('href');
    const absoluteBaseUrl = base && tryAbsoluteURL(base, baseUrl);

    // A usable <base href> takes precedence over the base URL supplied by the caller.
    if (absoluteBaseUrl) {
        baseUrl = absoluteBaseUrl;
    }

    // Grabbed this in 'is-absolute-url' package. URI schemes are case-insensitive (RFC 3986, section 3.1),
    // so the `i` flag keeps hrefs such as `HTTP://example.com` from being mistaken for relative ones.
    const absoluteUrlPattern = /^[a-z][a-z0-9+.-]*:/i;

    return $(selector)
        .map((_i, el) => $(el).attr('href'))
        .get()
        .filter((href) => !!href)
        .map((href) => {
            // Throw a meaningful error when only a relative URL would be extracted instead of waiting for the Request to fail later.
            const isHrefAbsolute = absoluteUrlPattern.test(href);
            if (!isHrefAbsolute && !baseUrl) {
                throw new Error(`An extracted URL: ${href} is relative and options.baseUrl is not set. `
                    + 'Use options.baseUrl in enqueueLinks() to automatically resolve relative URLs.');
            }
            return baseUrl
                ? tryAbsoluteURL(href, baseUrl)
                : href;
        })
        // `tryAbsoluteURL` yields `undefined` for hrefs it cannot resolve; drop those.
        .filter((href) => !!href) as string[];
}

/**
* Creates new {@apilink Router} instance that works based on request labels.
* This instance can then serve as a `requestHandler` of your {@apilink CheerioCrawler}.
Expand Down
13 changes: 2 additions & 11 deletions packages/core/src/enqueue_links/shared.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ import type { EnqueueLinksOptions } from './enqueue_links';
import type { RequestOptions } from '../request';
import { Request } from '../request';

export { tryAbsoluteURL } from '@crawlee/utils';

const MAX_ENQUEUE_LINKS_CACHE_SIZE = 1000;

/**
Expand Down Expand Up @@ -245,17 +247,6 @@ export function createRequestOptions(
});
}

/**
 * Resolves `href` against `baseUrl` and returns the resulting absolute URL.
 * Used to validate URLs while extracting them from a page.
 *
 * @param href the (possibly relative) URL to resolve
 * @param baseUrl the URL that `href` is resolved against
 * @returns the absolute URL string, or `undefined` when the `URL` constructor rejects the input
 */
export function tryAbsoluteURL(href: string, baseUrl: string): string | undefined {
    let absoluteUrl: string | undefined;

    try {
        const parsed = new URL(href, baseUrl);
        absoluteUrl = parsed.href;
    } catch {
        // Unparsable input is reported as `undefined` rather than as an exception.
        absoluteUrl = undefined;
    }

    return absoluteUrl;
}

/**
* Takes an Apify {@apilink RequestOptions} object and changes its attributes in a desired way. This user-function is used
* by {@apilink enqueueLinks} to modify requests before enqueuing them.
Expand Down
39 changes: 38 additions & 1 deletion packages/utils/src/internals/cheerio.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import type { Dictionary } from '@crawlee/types';
import type { load } from 'cheerio';
import type { load, CheerioAPI } from 'cheerio';
import cheerio from 'cheerio';

import { tryAbsoluteURL } from './extract-urls';

export type CheerioRoot = ReturnType<typeof load>;

// NOTE: We are skipping 'noscript' since its content is evaluated as text, instead of HTML elements. That damages the results.
Expand Down Expand Up @@ -77,3 +79,38 @@ export function htmlToText(htmlOrCheerioElement: string | CheerioRoot): string {

return text.trim();
}

/**
 * Extracts URLs from a given Cheerio object.
 *
 * Reads the `href` attribute of every element matched by `selector` and resolves it
 * against `baseUrl`. When the document contains a `<base>` element whose `href` can be
 * resolved into a valid URL, that URL is used as the base instead of `baseUrl`.
 * Links whose `href` is empty, or cannot be resolved into a valid URL, are left out of the result.
 *
 * @param $ the Cheerio object to extract URLs from
 * @param selector a CSS selector for matching link elements
 * @param baseUrl a URL for resolving relative links
 * @throws when a relative URL is encountered with no baseUrl set
 * @returns An array of absolute URLs
 */
export function extractUrlsFromCheerio($: CheerioAPI, selector: string = 'a', baseUrl: string = ''): string[] {
    const base = $('base').attr('href');
    const absoluteBaseUrl = base && tryAbsoluteURL(base, baseUrl);

    // A usable <base href> takes precedence over the base URL supplied by the caller.
    if (absoluteBaseUrl) {
        baseUrl = absoluteBaseUrl;
    }

    // Grabbed this in 'is-absolute-url' package. URI schemes are case-insensitive (RFC 3986, section 3.1),
    // so the `i` flag keeps hrefs such as `HTTP://example.com` from being mistaken for relative ones.
    const absoluteUrlPattern = /^[a-z][a-z0-9+.-]*:/i;

    return $(selector)
        .map((_i, el) => $(el).attr('href'))
        .get()
        .filter(Boolean)
        .map((href) => {
            // Throw a meaningful error when only a relative URL would be extracted instead of waiting for the Request to fail later.
            const isHrefAbsolute = absoluteUrlPattern.test(href);
            if (!isHrefAbsolute && !baseUrl) {
                throw new Error(`An extracted URL: ${href} is relative and baseUrl is not set. `
                    + 'Provide a baseUrl to automatically resolve relative URLs.');
            }
            return baseUrl
                ? tryAbsoluteURL(href, baseUrl)
                : href;
        })
        // `tryAbsoluteURL` yields `undefined` for hrefs it cannot resolve; drop those.
        .filter(Boolean) as string[];
}
11 changes: 11 additions & 0 deletions packages/utils/src/internals/extract-urls.ts
Original file line number Diff line number Diff line change
Expand Up @@ -83,3 +83,14 @@ export function extractUrls(options: ExtractUrlsOptions): string[] {

return result;
}

/**
 * Builds an absolute URL from `href`, using `baseUrl` to resolve it when it is relative.
 * Serves as a validation helper when extracting URLs from a page.
 *
 * @param href the (possibly relative) URL to resolve
 * @param baseUrl the URL that `href` is resolved against
 * @returns the absolute URL string, or `undefined` when the `URL` constructor rejects the input
 */
export function tryAbsoluteURL(href: string, baseUrl: string): string | undefined {
    let resolved: string | undefined;

    try {
        const url = new URL(href, baseUrl);
        resolved = url.href;
    } catch {
        // Invalid combinations of href and base are signalled with `undefined`.
        resolved = undefined;
    }

    return resolved;
}
1 change: 1 addition & 0 deletions yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -471,6 +471,7 @@ __metadata:
dependencies:
"@crawlee/http": "npm:3.7.1"
"@crawlee/types": "npm:3.7.1"
"@crawlee/utils": "npm:3.7.1"
cheerio: "npm:^1.0.0-rc.12"
htmlparser2: "npm:^9.0.0"
tslib: "npm:^2.4.0"
Expand Down

0 comments on commit 52b98e3

Please sign in to comment.