feat: robots.txt and sitemap.xml utils (#2214)
Closes #2187

---------

Co-authored-by: Jindřich Bär <[email protected]>
Co-authored-by: Martin Adámek <[email protected]>
Co-authored-by: Vlad Frangu <[email protected]>
4 people authored Dec 19, 2023
1 parent c1cfd45 commit fdfec4f
Showing 13 changed files with 462 additions and 16 deletions.
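In short: the commit adds a `RobotsFile` class for fetching and querying robots.txt files and a `Sitemap` class for loading sitemaps (including sitemap indexes and `.gz` sitemaps), both exported from `@crawlee/utils`. Below is a minimal sketch of how the two fit together, based on the docstrings added in this diff; the crawler setup and the concrete crawlee.dev URLs are illustrative assumptions, not part of the commit.

```typescript
import { RobotsFile, Sitemap } from '@crawlee/utils';
import { CheerioCrawler } from 'crawlee';

const crawler = new CheerioCrawler({
    // Minimal handler, just for the sake of the example.
    async requestHandler({ request, log }) {
        log.info(`Visited ${request.url}`);
    },
});

// Locate and parse robots.txt for the site that hosts the given page.
const robots = await RobotsFile.find('https://crawlee.dev/docs/introduction/first-crawler');

// Enqueue a URL only if robots.txt allows crawling it.
const url = 'https://crawlee.dev/api/puppeteer-crawler/class/PuppeteerCrawler';
if (robots.isAllowed(url)) {
    await crawler.addRequests([url]);
}

// Enqueue every URL from all sitemaps referenced in robots.txt.
await crawler.addRequests(await robots.parseUrlsFromSitemaps());

// Or load a known sitemap (or sitemap index) directly.
const { urls } = await Sitemap.load('https://crawlee.dev/sitemap.xml');
await crawler.addRequests(urls);

await crawler.run();
```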
2 changes: 1 addition & 1 deletion docs/examples/crawl_sitemap.mdx
@@ -12,7 +12,7 @@ import CheerioSource from '!!raw-loader!roa-loader!./crawl_sitemap_cheerio.ts';
import PuppeteerSource from '!!raw-loader!roa-loader!./crawl_sitemap_puppeteer.ts';
import PlaywrightSource from '!!raw-loader!roa-loader!./crawl_sitemap_playwright.ts';

This example downloads and crawls the URLs from a sitemap, by using the <ApiLink to="utils/function/downloadListOfUrls">`downloadListOfUrls`</ApiLink> utility method provided by the <ApiLink to="utils">`@crawlee/utils`</ApiLink> module.
This example downloads and crawls the URLs from a sitemap, by using the <ApiLink to="utils/class/Sitemap">`Sitemap`</ApiLink> utility class provided by the <ApiLink to="utils">`@crawlee/utils`</ApiLink> module.

<Tabs groupId="crawler-type">

6 changes: 3 additions & 3 deletions docs/examples/crawl_sitemap_cheerio.ts
@@ -1,4 +1,4 @@
import { CheerioCrawler, downloadListOfUrls } from 'crawlee';
import { CheerioCrawler, Sitemap } from 'crawlee';

const crawler = new CheerioCrawler({
// Function called for each URL
@@ -8,9 +8,9 @@ const crawler = new CheerioCrawler({
maxRequestsPerCrawl: 10, // Limitation for only 10 requests (do not use if you want to crawl a sitemap)
});

const listOfUrls = await downloadListOfUrls({ url: 'https://crawlee.dev/sitemap.xml' });
const { urls } = await Sitemap.load('https://crawlee.dev/sitemap.xml');

await crawler.addRequests(listOfUrls);
await crawler.addRequests(urls);

// Run the crawler
await crawler.run();
6 changes: 3 additions & 3 deletions docs/examples/crawl_sitemap_playwright.ts
@@ -1,4 +1,4 @@
import { PlaywrightCrawler, downloadListOfUrls } from 'crawlee';
import { PlaywrightCrawler, Sitemap } from 'crawlee';

const crawler = new PlaywrightCrawler({
// Function called for each URL
@@ -8,9 +8,9 @@ const crawler = new PlaywrightCrawler({
maxRequestsPerCrawl: 10, // Limitation for only 10 requests (do not use if you want to crawl a sitemap)
});

const listOfUrls = await downloadListOfUrls({ url: 'https://crawlee.dev/sitemap.xml' });
const { urls } = await Sitemap.load('https://crawlee.dev/sitemap.xml');

await crawler.addRequests(listOfUrls);
await crawler.addRequests(urls);

// Run the crawler
await crawler.run();
6 changes: 3 additions & 3 deletions docs/examples/crawl_sitemap_puppeteer.ts
@@ -1,4 +1,4 @@
import { PuppeteerCrawler, downloadListOfUrls } from 'crawlee';
import { PuppeteerCrawler, Sitemap } from 'crawlee';

const crawler = new PuppeteerCrawler({
// Function called for each URL
@@ -8,9 +8,9 @@ const crawler = new PuppeteerCrawler({
maxRequestsPerCrawl: 10, // Limitation for only 10 requests (do not use if you want to crawl a sitemap)
});

const listOfUrls = await downloadListOfUrls({ url: 'https://crawlee.dev/sitemap.xml' });
const { urls } = await Sitemap.load('https://crawlee.dev/sitemap.xml');

await crawler.addRequests(listOfUrls);
await crawler.addRequests(urls);

// Run the crawler
await crawler.run();
2 changes: 2 additions & 0 deletions package.json
@@ -72,6 +72,7 @@
"@types/proper-lockfile": "^4.1.2",
"@types/ps-tree": "^1.1.2",
"@types/rimraf": "^3.0.2",
"@types/sax": "^1.0.0",
"@types/semver": "^7.3.12",
"@types/stream-json": "^1.7.2",
"@types/tough-cookie": "^4.0.2",
@@ -95,6 +96,7 @@
"is-ci": "^3.0.1",
"lerna": "^8.0.0",
"lint-staged": "^15.0.0",
"nock": "^13.4.0",
"playwright": "1.40.1",
"portastic": "^1.0.1",
"proxy": "^1.0.2",
4 changes: 3 additions & 1 deletion packages/utils/package.json
@@ -51,8 +51,10 @@
"@apify/ps-tree": "^1.2.0",
"@crawlee/types": "3.6.2",
"cheerio": "^1.0.0-rc.12",
"got-scraping": "^4.0.0",
"got-scraping": "^4.0.3",
"ow": "^0.28.1",
"robots-parser": "^3.0.1",
"sax": "^1.3.0",
"tslib": "^2.4.0"
}
}
2 changes: 2 additions & 0 deletions packages/utils/src/index.ts
@@ -10,5 +10,7 @@ export * from './internals/typedefs';
export * from './internals/error_tracker';
export * from './internals/open_graph_parser';
export * from './internals/gotScraping';
export * from './internals/robots';
export * from './internals/sitemap';

export { Dictionary, Awaitable, Constructor } from '@crawlee/types';
98 changes: 98 additions & 0 deletions packages/utils/src/internals/robots.ts
@@ -0,0 +1,98 @@
// @ts-expect-error This throws a compilation error due to got-scraping being ESM only but we only import types, so its alllll gooooood
import type { HTTPError as HTTPErrorClass } from 'got-scraping';
import type { Robot } from 'robots-parser';
import robotsParser from 'robots-parser';

import { gotScraping } from './gotScraping';
import { Sitemap } from './sitemap';

let HTTPError: typeof HTTPErrorClass;

/**
* Loads and queries information from a robots.txt file.
*
* **Example usage:**
* ```javascript
* // Load the robots.txt file
* const robots = await RobotsFile.load('https://crawlee.dev/docs/introduction/first-crawler');
*
* // Check if a URL should be crawled according to robots.txt
* const url = 'https://crawlee.dev/api/puppeteer-crawler/class/PuppeteerCrawler';
* if (robots.isAllowed(url)) {
* await crawler.addRequests([url]);
* }
*
* // Enqueue all links in the sitemap(s)
* await crawler.addRequests(await robots.parseUrlsFromSitemaps());
* ```
*/
export class RobotsFile {
private constructor(
private robots: Pick<Robot, 'isAllowed' | 'getSitemaps'>,
private proxyUrl?: string,
) {}

/**
* Determine the location of a robots.txt file for a URL and fetch it.
* @param url the URL to fetch robots.txt for
* @param proxyUrl a proxy to be used for fetching the robots.txt file
*/
static async find(url: string, proxyUrl?: string): Promise<RobotsFile> {
const robotsFileUrl = new URL(url);
robotsFileUrl.pathname = '/robots.txt';
robotsFileUrl.search = '';

return RobotsFile.load(robotsFileUrl.toString(), proxyUrl);
}

protected static async load(url: string, proxyUrl?: string): Promise<RobotsFile> {
if (!HTTPError) {
HTTPError = (await import('got-scraping')).HTTPError;
}

try {
const response = await gotScraping({
url,
proxyUrl,
method: 'GET',
responseType: 'text',
});

return new RobotsFile(robotsParser(url.toString(), response.body), proxyUrl);
} catch (e) {
if (e instanceof HTTPError && e.response.statusCode === 404) {
return new RobotsFile({ isAllowed() { return true; }, getSitemaps() { return []; } }, proxyUrl);
}
throw e;
}
}

/**
* Check if a URL should be crawled by robots.
* @param url the URL to check against the rules in robots.txt
*/
isAllowed(url: string): boolean {
return this.robots.isAllowed(url, '*') ?? false;
}

/**
* Get URLs of sitemaps referenced in the robots file.
*/
getSitemaps(): string[] {
return this.robots.getSitemaps();
}

/**
* Parse all the sitemaps referenced in the robots file.
*/
async parseSitemaps(): Promise<Sitemap> {
return Sitemap.load(this.robots.getSitemaps(), this.proxyUrl);
}

/**
* Get all URLs from all the sitemaps referenced in the robots file. A shorthand for `(await robots.parseSitemaps()).urls`.
*/
async parseUrlsFromSitemaps(): Promise<string[]> {
return (await this.parseSitemaps()).urls;
}
}
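Two details of `RobotsFile` above are easy to miss: `find()` keeps only the origin of the URL it is given (the pathname is replaced with `/robots.txt` and the query string is cleared), and a 404 response produces a permissive fallback. A small hedged sketch of the resulting behavior; the example.com URLs are hypothetical.

```typescript
import { RobotsFile } from '@crawlee/utils';

// find() only uses the origin of the page URL: the pathname becomes
// /robots.txt and the query string is dropped before fetching.
// (Hypothetical URL, used only for illustration.)
const robots = await RobotsFile.find('https://example.com/some/deep/page?utm_source=newsletter');

// If the server responds with 404, RobotsFile falls back to a stub
// that allows every URL and reports no sitemaps.
console.log(robots.isAllowed('https://example.com/anything')); // true when there is no robots.txt
console.log(robots.getSitemaps()); // [] when there is no robots.txt
```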
116 changes: 116 additions & 0 deletions packages/utils/src/internals/sitemap.ts
@@ -0,0 +1,116 @@
import type { Duplex } from 'node:stream';
import { createGunzip } from 'node:zlib';

import log from '@apify/log';
import type { SAXStream } from 'sax';
import sax from 'sax';

class ParsingState {
sitemapUrls: string[] = [];
urls: string[] = [];
visitedSitemapUrls: string[] = [];
context?: 'sitemapindex' | 'urlset';
loc = false;

resetContext() {
this.context = undefined;
this.loc = false;
}
}

/**
* Loads one or more sitemaps from given URLs, following references in sitemap index files, and exposes the contained URLs.
*
* **Example usage:**
* ```javascript
* // Load a sitemap
* const sitemap = await Sitemap.load(['https://example.com/sitemap.xml', 'https://example.com/sitemap_2.xml.gz']);
*
* // Enqueue all the contained URLs (including those from sub-sitemaps from sitemap indexes)
* await crawler.addRequests(sitemap.urls);
* ```
*/
export class Sitemap {
constructor(readonly urls: string[]) {}

protected static createParser(parsingState: ParsingState, onEnd: () => void, onError: (error: Error) => void): SAXStream {
const parser = sax.createStream(true);

parser.on('opentag', (node) => {
if (node.name === 'loc' && parsingState.context !== undefined) {
parsingState.loc = true;
}
if (node.name === 'urlset') {
parsingState.context = 'urlset';
}
if (node.name === 'sitemapindex') {
parsingState.context = 'sitemapindex';
}
});

parser.on('closetag', (name) => {
if (name === 'loc') {
parsingState.loc = false;
}
});

parser.on('text', (text) => {
if (parsingState.loc) {
if (parsingState.context === 'sitemapindex') {
if (!parsingState.visitedSitemapUrls.includes(text)) {
parsingState.sitemapUrls.push(text);
}
}
if (parsingState.context === 'urlset') {
parsingState.urls.push(text);
}
}
});

parser.on('end', onEnd);
parser.on('error', onError);

return parser;
}

/**
* Fetch sitemap content from given URL or URLs and return URLs of referenced pages.
* @param urls sitemap URL(s)
* @param proxyUrl URL of a proxy to be used for fetching sitemap contents
*/
static async load(urls: string | string[], proxyUrl?: string): Promise<Sitemap> {
const { gotScraping } = await import('got-scraping');

const parsingState = new ParsingState();
parsingState.sitemapUrls = Array.isArray(urls) ? urls : [urls];

while (parsingState.sitemapUrls.length > 0) {
const sitemapUrl = parsingState.sitemapUrls.pop()!;
parsingState.visitedSitemapUrls.push(sitemapUrl);
parsingState.resetContext();

try {
const sitemapStream = await new Promise<ReturnType<typeof gotScraping.stream>>((resolve, reject) => {
const request = gotScraping.stream({ url: sitemapUrl, proxyUrl, method: 'GET' });
request.on('response', () => resolve(request));
request.on('error', reject);
});

if (sitemapStream.response!.statusCode === 200) {
await new Promise((resolve, reject) => {
const parser = Sitemap.createParser(parsingState, () => resolve(undefined), reject);
let stream: Duplex = sitemapStream;
if (sitemapUrl.endsWith('.gz')) {
stream = stream.pipe(createGunzip());
}
stream.pipe(parser);
});
}
} catch (e) {
log.warning(`Malformed sitemap content: ${sitemapUrl}`);
}
}

return new Sitemap(parsingState.urls);
}
}
51 changes: 51 additions & 0 deletions packages/utils/test/robots.test.ts
@@ -0,0 +1,51 @@
import nock from 'nock';
import { afterEach, beforeEach, describe, expect, it } from 'vitest';

import { RobotsFile } from '../src/internals/robots';

describe('RobotsFile', () => {
beforeEach(() => {
nock.disableNetConnect();
nock('http://not-exists.com').persist()
.get('/robots.txt')
.reply(200, [
'User-agent: *',
'Disallow: *deny_all/',
'crawl-delay: 10',

'User-agent: Googlebot',
'Disallow: *deny_googlebot/',
'crawl-delay: 1',

'user-agent: Mozilla',
'crawl-delay: 2',

'sitemap: http://not-exists.com/sitemap_1.xml',
'sitemap: http://not-exists.com/sitemap_2.xml',
].join('\n'))
.get('*')
.reply(404);
});

afterEach(() => {
nock.cleanAll();
nock.enableNetConnect();
});

it('generates the correct robots.txt URL', async () => {
const robots = await RobotsFile.find('http://not-exists.com/nested/index.html');
expect(robots.getSitemaps()).not.toHaveLength(0);
});

it('parses allow/deny directives from robots.txt', async () => {
const robots = await RobotsFile.find('http://not-exists.com/robots.txt');
expect(robots.isAllowed('http://not-exists.com/something/page.html')).toBe(true);
expect(robots.isAllowed('http://not-exists.com/deny_googlebot/page.html')).toBe(true);
expect(robots.isAllowed('http://not-exists.com/deny_all/page.html')).toBe(false);
});

it('extracts sitemap urls', async () => {
const robots = await RobotsFile.find('http://not-exists.com/robots.txt');
expect(robots.getSitemaps()).toEqual(['http://not-exists.com/sitemap_1.xml', 'http://not-exists.com/sitemap_2.xml']);
});
});
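The remaining changed files, including tests for the Sitemap class, are not rendered above. Purely as a hedged illustration of the sitemap-index handling in sitemap.ts, and not the commit's actual test, a nock-based test in the style of robots.test.ts might look like this:

```typescript
import nock from 'nock';
import { afterEach, beforeEach, describe, expect, it } from 'vitest';

import { Sitemap } from '../src/internals/sitemap';

describe('Sitemap', () => {
    beforeEach(() => {
        nock.disableNetConnect();
        // Hypothetical fixtures, mirroring the robots.txt test above.
        nock('http://not-exists.com').persist()
            // A sitemap index pointing at two child sitemaps.
            .get('/sitemap_index.xml')
            .reply(200, [
                '<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">',
                '<sitemap><loc>http://not-exists.com/sitemap_1.xml</loc></sitemap>',
                '<sitemap><loc>http://not-exists.com/sitemap_2.xml</loc></sitemap>',
                '</sitemapindex>',
            ].join(''), { 'content-type': 'application/xml' })
            // Each child sitemap lists a single page URL.
            .get('/sitemap_1.xml')
            .reply(200, '<urlset><url><loc>http://not-exists.com/page_1.html</loc></url></urlset>', { 'content-type': 'application/xml' })
            .get('/sitemap_2.xml')
            .reply(200, '<urlset><url><loc>http://not-exists.com/page_2.html</loc></url></urlset>', { 'content-type': 'application/xml' });
    });

    afterEach(() => {
        nock.cleanAll();
        nock.enableNetConnect();
    });

    it('follows sitemap index references and collects the page URLs', async () => {
        const { urls } = await Sitemap.load('http://not-exists.com/sitemap_index.xml');
        expect(urls.sort()).toEqual(['http://not-exists.com/page_1.html', 'http://not-exists.com/page_2.html']);
    });
});
```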