feat: robots.txt and sitemap.xml utils (#2214)
Closes #2187

---------

Co-authored-by: Jindřich Bär <[email protected]>
Co-authored-by: Martin Adámek <[email protected]>
Co-authored-by: Vlad Frangu <[email protected]>
4 people authored Dec 19, 2023
1 parent c1cfd45 commit fdfec4f
Showing 13 changed files with 462 additions and 16 deletions.
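In short: the commit adds a `RobotsFile` class for fetching and querying robots.txt files and a `Sitemap` class for loading sitemaps (including sitemap indexes and `.gz` sitemaps), both exported from `@crawlee/utils`. Below is a minimal sketch of how the two fit together, based on the docstrings added in this diff; the crawler setup and the concrete crawlee.dev URLs are illustrative assumptions, not part of the commit.

```typescript
import { RobotsFile, Sitemap } from '@crawlee/utils';
import { CheerioCrawler } from 'crawlee';

const crawler = new CheerioCrawler({
    // Minimal handler, just for the sake of the example.
    async requestHandler({ request, log }) {
        log.info(`Visited ${request.url}`);
    },
});

// Locate and parse robots.txt for the site that hosts the given page.
const robots = await RobotsFile.find('https://crawlee.dev/docs/introduction/first-crawler');

// Enqueue a URL only if robots.txt allows crawling it.
const url = 'https://crawlee.dev/api/puppeteer-crawler/class/PuppeteerCrawler';
if (robots.isAllowed(url)) {
    await crawler.addRequests([url]);
}

// Enqueue every URL from all sitemaps referenced in robots.txt.
await crawler.addRequests(await robots.parseUrlsFromSitemaps());

// Or load a known sitemap (or sitemap index) directly.
const { urls } = await Sitemap.load('https://crawlee.dev/sitemap.xml');
await crawler.addRequests(urls);

await crawler.run();
```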
2 changes: 1 addition & 1 deletion docs/examples/crawl_sitemap.mdx
@@ -12,7 +12,7 @@ import CheerioSource from '!!raw-loader!roa-loader!./crawl_sitemap_cheerio.ts';
import PuppeteerSource from '!!raw-loader!roa-loader!./crawl_sitemap_puppeteer.ts';
import PlaywrightSource from '!!raw-loader!roa-loader!./crawl_sitemap_playwright.ts';

This example downloads and crawls the URLs from a sitemap, by using the <ApiLink to="utils/function/downloadListOfUrls">`downloadListOfUrls`</ApiLink> utility method provided by the <ApiLink to="utils">`@crawlee/utils`</ApiLink> module.
This example downloads and crawls the URLs from a sitemap, by using the <ApiLink to="utils/class/Sitemap">`Sitemap`</ApiLink> utility class provided by the <ApiLink to="utils">`@crawlee/utils`</ApiLink> module.

<Tabs groupId="crawler-type">

6 changes: 3 additions & 3 deletions docs/examples/crawl_sitemap_cheerio.ts
@@ -1,4 +1,4 @@
import { CheerioCrawler, downloadListOfUrls } from 'crawlee';
import { CheerioCrawler, Sitemap } from 'crawlee';

const crawler = new CheerioCrawler({
// Function called for each URL
@@ -8,9 +8,9 @@ const crawler = new CheerioCrawler({
maxRequestsPerCrawl: 10, // Limitation for only 10 requests (do not use if you want to crawl a sitemap)
});

const listOfUrls = await downloadListOfUrls({ url: 'https://crawlee.dev/sitemap.xml' });
const { urls } = await Sitemap.load('https://crawlee.dev/sitemap.xml');

await crawler.addRequests(listOfUrls);
await crawler.addRequests(urls);

// Run the crawler
await crawler.run();
6 changes: 3 additions & 3 deletions docs/examples/crawl_sitemap_playwright.ts
@@ -1,4 +1,4 @@
import { PlaywrightCrawler, downloadListOfUrls } from 'crawlee';
import { PlaywrightCrawler, Sitemap } from 'crawlee';

const crawler = new PlaywrightCrawler({
// Function called for each URL
@@ -8,9 +8,9 @@ const crawler = new PlaywrightCrawler({
maxRequestsPerCrawl: 10, // Limitation for only 10 requests (do not use if you want to crawl a sitemap)
});

const listOfUrls = await downloadListOfUrls({ url: 'https://crawlee.dev/sitemap.xml' });
const { urls } = await Sitemap.load('https://crawlee.dev/sitemap.xml');

await crawler.addRequests(listOfUrls);
await crawler.addRequests(urls);

// Run the crawler
await crawler.run();
6 changes: 3 additions & 3 deletions docs/examples/crawl_sitemap_puppeteer.ts
@@ -1,4 +1,4 @@
import { PuppeteerCrawler, downloadListOfUrls } from 'crawlee';
import { PuppeteerCrawler, Sitemap } from 'crawlee';

const crawler = new PuppeteerCrawler({
// Function called for each URL
@@ -8,9 +8,9 @@ const crawler = new PuppeteerCrawler({
maxRequestsPerCrawl: 10, // Limitation for only 10 requests (do not use if you want to crawl a sitemap)
});

const listOfUrls = await downloadListOfUrls({ url: 'https://crawlee.dev/sitemap.xml' });
const { urls } = await Sitemap.load('https://crawlee.dev/sitemap.xml');

await crawler.addRequests(listOfUrls);
await crawler.addRequests(urls);

// Run the crawler
await crawler.run();
2 changes: 2 additions & 0 deletions package.json
@@ -72,6 +72,7 @@
"@types/proper-lockfile": "^4.1.2",
"@types/ps-tree": "^1.1.2",
"@types/rimraf": "^3.0.2",
"@types/sax": "^1.0.0",
"@types/semver": "^7.3.12",
"@types/stream-json": "^1.7.2",
"@types/tough-cookie": "^4.0.2",
@@ -95,6 +96,7 @@
"is-ci": "^3.0.1",
"lerna": "^8.0.0",
"lint-staged": "^15.0.0",
"nock": "^13.4.0",
"playwright": "1.40.1",
"portastic": "^1.0.1",
"proxy": "^1.0.2",
4 changes: 3 additions & 1 deletion packages/utils/package.json
@@ -51,8 +51,10 @@
"@apify/ps-tree": "^1.2.0",
"@crawlee/types": "3.6.2",
"cheerio": "^1.0.0-rc.12",
"got-scraping": "^4.0.0",
"got-scraping": "^4.0.3",
"ow": "^0.28.1",
"robots-parser": "^3.0.1",
"sax": "^1.3.0",
"tslib": "^2.4.0"
}
}
2 changes: 2 additions & 0 deletions packages/utils/src/index.ts
@@ -10,5 +10,7 @@ export * from './internals/typedefs';
export * from './internals/error_tracker';
export * from './internals/open_graph_parser';
export * from './internals/gotScraping';
export * from './internals/robots';
export * from './internals/sitemap';

export { Dictionary, Awaitable, Constructor } from '@crawlee/types';
98 changes: 98 additions & 0 deletions packages/utils/src/internals/robots.ts
@@ -0,0 +1,98 @@
// @ts-expect-error This throws a compilation error due to got-scraping being ESM only but we only import types, so its alllll gooooood
import type { HTTPError as HTTPErrorClass } from 'got-scraping';
import type { Robot } from 'robots-parser';
import robotsParser from 'robots-parser';

import { gotScraping } from './gotScraping';
import { Sitemap } from './sitemap';

let HTTPError: typeof HTTPErrorClass;

/**
* Loads and queries information from a robots.txt file.
*
* **Example usage:**
* ```javascript
* // Load the robots.txt file
* const robots = await RobotsFile.load('https://crawlee.dev/docs/introduction/first-crawler');
*
* // Check if a URL should be crawled according to robots.txt
* const url = 'https://crawlee.dev/api/puppeteer-crawler/class/PuppeteerCrawler';
* if (robots.isAllowed(url)) {
* await crawler.addRequests([url]);
* }
*
* // Enqueue all links in the sitemap(s)
* await crawler.addRequests(await robots.parseUrlsFromSitemaps());
* ```
*/
export class RobotsFile {
private constructor(
private robots: Pick<Robot, 'isAllowed' | 'getSitemaps'>,
private proxyUrl?: string,
) {}

/**
* Determine the location of a robots.txt file for a URL and fetch it.
* @param url the URL to fetch robots.txt for
* @param proxyUrl a proxy to be used for fetching the robots.txt file
*/
static async find(url: string, proxyUrl?: string): Promise<RobotsFile> {
const robotsFileUrl = new URL(url);
robotsFileUrl.pathname = '/robots.txt';
robotsFileUrl.search = '';

return RobotsFile.load(robotsFileUrl.toString(), proxyUrl);
}

protected static async load(url: string, proxyUrl?: string): Promise<RobotsFile> {
if (!HTTPError) {
HTTPError = (await import('got-scraping')).HTTPError;
}

try {
const response = await gotScraping({
url,
proxyUrl,
method: 'GET',
responseType: 'text',
});

return new RobotsFile(robotsParser(url.toString(), response.body), proxyUrl);
} catch (e) {
if (e instanceof HTTPError && e.response.statusCode === 404) {
return new RobotsFile({ isAllowed() { return true; }, getSitemaps() { return []; } }, proxyUrl);
}
throw e;
}
}

/**
* Check if a URL should be crawled by robots.
* @param url the URL to check against the rules in robots.txt
*/
isAllowed(url: string): boolean {
return this.robots.isAllowed(url, '*') ?? false;
}

/**
* Get URLs of sitemaps referenced in the robots file.
*/
getSitemaps(): string[] {
return this.robots.getSitemaps();
}

/**
* Parse all the sitemaps referenced in the robots file.
*/
async parseSitemaps(): Promise<Sitemap> {
return Sitemap.load(this.robots.getSitemaps(), this.proxyUrl);
}

/**
* Get all URLs from all the sitemaps referenced in the robots file. A shorthand for `(await robots.parseSitemaps()).urls`.
*/
async parseUrlsFromSitemaps(): Promise<string[]> {
return (await this.parseSitemaps()).urls;
}
}
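Two details of `RobotsFile` above are easy to miss: `find()` keeps only the origin of the URL it is given (the pathname is replaced with `/robots.txt` and the query string is cleared), and a 404 response produces a permissive fallback. A small hedged sketch of the resulting behavior; the example.com URLs are hypothetical.

```typescript
import { RobotsFile } from '@crawlee/utils';

// find() only uses the origin of the page URL: the pathname becomes
// /robots.txt and the query string is dropped before fetching.
// (Hypothetical URL, used only for illustration.)
const robots = await RobotsFile.find('https://example.com/some/deep/page?utm_source=newsletter');

// If the server responds with 404, RobotsFile falls back to a stub
// that allows every URL and reports no sitemaps.
console.log(robots.isAllowed('https://example.com/anything')); // true when there is no robots.txt
console.log(robots.getSitemaps()); // [] when there is no robots.txt
```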
116 changes: 116 additions & 0 deletions packages/utils/src/internals/sitemap.ts
@@ -0,0 +1,116 @@
import type { Duplex } from 'node:stream';
import { createGunzip } from 'node:zlib';

import log from '@apify/log';
import type { SAXStream } from 'sax';
import sax from 'sax';

class ParsingState {
sitemapUrls: string[] = [];
urls: string[] = [];
visitedSitemapUrls: string[] = [];
context?: 'sitemapindex' | 'urlset';
loc = false;

resetContext() {
this.context = undefined;
this.loc = false;
}
}

/**
* Loads one or more sitemaps from given URLs, following references in sitemap index files, and exposes the contained URLs.
*
* **Example usage:**
* ```javascript
* // Load a sitemap
* const sitemap = await Sitemap.load(['https://example.com/sitemap.xml', 'https://example.com/sitemap_2.xml.gz']);
*
* // Enqueue all the contained URLs (including those from sub-sitemaps from sitemap indexes)
* await crawler.addRequests(sitemap.urls);
* ```
*/
export class Sitemap {
constructor(readonly urls: string[]) {}

protected static createParser(parsingState: ParsingState, onEnd: () => void, onError: (error: Error) => void): SAXStream {
const parser = sax.createStream(true);

parser.on('opentag', (node) => {
if (node.name === 'loc' && parsingState.context !== undefined) {
parsingState.loc = true;
}
if (node.name === 'urlset') {
parsingState.context = 'urlset';
}
if (node.name === 'sitemapindex') {
parsingState.context = 'sitemapindex';
}
});

parser.on('closetag', (name) => {
if (name === 'loc') {
parsingState.loc = false;
}
});

parser.on('text', (text) => {
if (parsingState.loc) {
if (parsingState.context === 'sitemapindex') {
if (!parsingState.visitedSitemapUrls.includes(text)) {
parsingState.sitemapUrls.push(text);
}
}
if (parsingState.context === 'urlset') {
parsingState.urls.push(text);
}
}
});

parser.on('end', onEnd);
parser.on('error', onError);

return parser;
}

/**
* Fetch sitemap content from given URL or URLs and return URLs of referenced pages.
* @param urls sitemap URL(s)
* @param proxyUrl URL of a proxy to be used for fetching sitemap contents
*/
static async load(urls: string | string[], proxyUrl?: string): Promise<Sitemap> {
const { gotScraping } = await import('got-scraping');

const parsingState = new ParsingState();
parsingState.sitemapUrls = Array.isArray(urls) ? urls : [urls];

while (parsingState.sitemapUrls.length > 0) {
const sitemapUrl = parsingState.sitemapUrls.pop()!;
parsingState.visitedSitemapUrls.push(sitemapUrl);
parsingState.resetContext();

try {
const sitemapStream = await new Promise<ReturnType<typeof gotScraping.stream>>((resolve, reject) => {
const request = gotScraping.stream({ url: sitemapUrl, proxyUrl, method: 'GET' });
request.on('response', () => resolve(request));
request.on('error', reject);
});

if (sitemapStream.response!.statusCode === 200) {
await new Promise((resolve, reject) => {
const parser = Sitemap.createParser(parsingState, () => resolve(undefined), reject);
let stream: Duplex = sitemapStream;
if (sitemapUrl.endsWith('.gz')) {
stream = stream.pipe(createGunzip());
}
stream.pipe(parser);
});
}
} catch (e) {
log.warning(`Malformed sitemap content: ${sitemapUrl}`);
}
}

return new Sitemap(parsingState.urls);
}
}
51 changes: 51 additions & 0 deletions packages/utils/test/robots.test.ts
@@ -0,0 +1,51 @@
import nock from 'nock';
import { afterEach, beforeEach, describe, expect, it } from 'vitest';

import { RobotsFile } from '../src/internals/robots';

describe('RobotsFile', () => {
beforeEach(() => {
nock.disableNetConnect();
nock('http://not-exists.com').persist()
.get('/robots.txt')
.reply(200, [
'User-agent: *',
'Disallow: *deny_all/',
'crawl-delay: 10',

'User-agent: Googlebot',
'Disallow: *deny_googlebot/',
'crawl-delay: 1',

'user-agent: Mozilla',
'crawl-delay: 2',

'sitemap: http://not-exists.com/sitemap_1.xml',
'sitemap: http://not-exists.com/sitemap_2.xml',
].join('\n'))
.get('*')
.reply(404);
});

afterEach(() => {
nock.cleanAll();
nock.enableNetConnect();
});

it('generates the correct robots.txt URL', async () => {
const robots = await RobotsFile.find('http://not-exists.com/nested/index.html');
expect(robots.getSitemaps()).not.toHaveLength(0);
});

it('parses allow/deny directives from robots.txt', async () => {
const robots = await RobotsFile.find('http://not-exists.com/robots.txt');
expect(robots.isAllowed('http://not-exists.com/something/page.html')).toBe(true);
expect(robots.isAllowed('http://not-exists.com/deny_googlebot/page.html')).toBe(true);
expect(robots.isAllowed('http://not-exists.com/deny_all/page.html')).toBe(false);
});

it('extracts sitemap urls', async () => {
const robots = await RobotsFile.find('http://not-exists.com/robots.txt');
expect(robots.getSitemaps()).toEqual(['http://not-exists.com/sitemap_1.xml', 'http://not-exists.com/sitemap_2.xml']);
});
});
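The remaining changed files, including tests for the Sitemap class, are not rendered above. Purely as a hedged illustration of the sitemap-index handling in sitemap.ts, and not the commit's actual test, a nock-based test in the style of robots.test.ts might look like this:

```typescript
import nock from 'nock';
import { afterEach, beforeEach, describe, expect, it } from 'vitest';

import { Sitemap } from '../src/internals/sitemap';

describe('Sitemap', () => {
    beforeEach(() => {
        nock.disableNetConnect();
        // Hypothetical fixtures, mirroring the robots.txt test above.
        nock('http://not-exists.com').persist()
            // A sitemap index pointing at two child sitemaps.
            .get('/sitemap_index.xml')
            .reply(200, [
                '<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">',
                '<sitemap><loc>http://not-exists.com/sitemap_1.xml</loc></sitemap>',
                '<sitemap><loc>http://not-exists.com/sitemap_2.xml</loc></sitemap>',
                '</sitemapindex>',
            ].join(''), { 'content-type': 'application/xml' })
            // Each child sitemap lists a single page URL.
            .get('/sitemap_1.xml')
            .reply(200, '<urlset><url><loc>http://not-exists.com/page_1.html</loc></url></urlset>', { 'content-type': 'application/xml' })
            .get('/sitemap_2.xml')
            .reply(200, '<urlset><url><loc>http://not-exists.com/page_2.html</loc></url></urlset>', { 'content-type': 'application/xml' });
    });

    afterEach(() => {
        nock.cleanAll();
        nock.enableNetConnect();
    });

    it('follows sitemap index references and collects the page URLs', async () => {
        const { urls } = await Sitemap.load('http://not-exists.com/sitemap_index.xml');
        expect(urls.sort()).toEqual(['http://not-exists.com/page_1.html', 'http://not-exists.com/page_2.html']);
    });
});
```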