feat: robots.txt and sitemap.xml utils (#2214)
Closes #2187

Co-authored-by: Jindřich Bär <[email protected]>
Co-authored-by: Martin Adámek <[email protected]>
Co-authored-by: Vlad Frangu <[email protected]>
1 parent c1cfd45, commit fdfec4f
Showing 13 changed files with 462 additions and 16 deletions. Three of the changed files appear below.
src/internals/robots.ts
@@ -0,0 +1,98 @@
// @ts-expect-error This throws a compilation error due to got-scraping being ESM only but we only import types, so its alllll gooooood
import type { HTTPError as HTTPErrorClass } from 'got-scraping';
import type { Robot } from 'robots-parser';
import robotsParser from 'robots-parser';

import { gotScraping } from './gotScraping';
import { Sitemap } from './sitemap';

let HTTPError: typeof HTTPErrorClass;

/**
 * Loads and queries information from a robots.txt file.
 *
 * **Example usage:**
 * ```javascript
 * // Load the robots.txt file
 * const robots = await RobotsFile.find('https://crawlee.dev/docs/introduction/first-crawler');
 *
 * // Check if a URL should be crawled according to robots.txt
 * const url = 'https://crawlee.dev/api/puppeteer-crawler/class/PuppeteerCrawler';
 * if (robots.isAllowed(url)) {
 *     await crawler.addRequests([url]);
 * }
 *
 * // Enqueue all links in the sitemap(s)
 * await crawler.addRequests(await robots.parseUrlsFromSitemaps());
 * ```
 */
export class RobotsFile {
    private constructor(
        private robots: Pick<Robot, 'isAllowed' | 'getSitemaps'>,
        private proxyUrl?: string,
    ) {}

    /**
     * Determine the location of a robots.txt file for a URL and fetch it.
     * @param url the URL to fetch robots.txt for
     * @param proxyUrl a proxy to be used for fetching the robots.txt file
     */
    static async find(url: string, proxyUrl?: string): Promise<RobotsFile> {
        const robotsFileUrl = new URL(url);
        robotsFileUrl.pathname = '/robots.txt';
        robotsFileUrl.search = '';

        return RobotsFile.load(robotsFileUrl.toString(), proxyUrl);
    }

    protected static async load(url: string, proxyUrl?: string): Promise<RobotsFile> {
        if (!HTTPError) {
            HTTPError = (await import('got-scraping')).HTTPError;
        }

        try {
            const response = await gotScraping({
                url,
                proxyUrl,
                method: 'GET',
                responseType: 'text',
            });

            return new RobotsFile(robotsParser(url.toString(), response.body), proxyUrl);
        } catch (e) {
            if (e instanceof HTTPError && e.response.statusCode === 404) {
                return new RobotsFile({ isAllowed() { return true; }, getSitemaps() { return []; } }, proxyUrl);
            }
            throw e;
        }
    }

    /**
     * Check if a URL should be crawled by robots.
     * @param url the URL to check against the rules in robots.txt
     */
    isAllowed(url: string): boolean {
        return this.robots.isAllowed(url, '*') ?? false;
    }

    /**
     * Get URLs of sitemaps referenced in the robots file.
     */
    getSitemaps(): string[] {
        return this.robots.getSitemaps();
    }

    /**
     * Parse all the sitemaps referenced in the robots file.
     */
    async parseSitemaps(): Promise<Sitemap> {
        return Sitemap.load(this.robots.getSitemaps(), this.proxyUrl);
    }

    /**
     * Get all URLs from all the sitemaps referenced in the robots file. A shorthand for `(await robots.parseSitemaps()).urls`.
     */
    async parseUrlsFromSitemaps(): Promise<string[]> {
        return (await this.parseSitemaps()).urls;
    }
}
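For orientation, here is a minimal sketch of how the new `RobotsFile` class might be wired into a crawler. It assumes the class is re-exported from `@crawlee/utils`, that a `CheerioCrawler` is used, and that the script runs as an ES module with top-level await; the candidate URLs are placeholders, not part of this commit.

```typescript
import { RobotsFile } from '@crawlee/utils'; // assumption: re-exported from the package entry point
import { CheerioCrawler } from 'crawlee';

const crawler = new CheerioCrawler({
    async requestHandler({ request, log }) {
        log.info(`Processing ${request.url}`);
    },
});

// Locate and fetch robots.txt for the target site.
const robots = await RobotsFile.find('https://crawlee.dev/docs/introduction');

// Enqueue only the candidate URLs that robots.txt allows for the '*' user agent.
const candidates = [
    'https://crawlee.dev/docs/quick-start',
    'https://crawlee.dev/api/core',
];
await crawler.addRequests(candidates.filter((url) => robots.isAllowed(url)));

// Optionally seed the crawl from the sitemaps referenced in robots.txt.
await crawler.addRequests(await robots.parseUrlsFromSitemaps());

await crawler.run();
```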
src/internals/sitemap.ts
@@ -0,0 +1,116 @@
import type { Duplex } from 'node:stream';
import { createGunzip } from 'node:zlib';

import log from '@apify/log';
import type { SAXStream } from 'sax';
import sax from 'sax';

class ParsingState {
    sitemapUrls: string[] = [];
    urls: string[] = [];
    visitedSitemapUrls: string[] = [];
    context?: 'sitemapindex' | 'urlset';
    loc = false;

    resetContext() {
        this.context = undefined;
        this.loc = false;
    }
}

/**
 * Loads one or more sitemaps from given URLs, following references in sitemap index files, and exposes the contained URLs.
 *
 * **Example usage:**
 * ```javascript
 * // Load a sitemap
 * const sitemap = await Sitemap.load(['https://example.com/sitemap.xml', 'https://example.com/sitemap_2.xml.gz']);
 *
 * // Enqueue all the contained URLs (including those from sub-sitemaps from sitemap indexes)
 * await crawler.addRequests(sitemap.urls);
 * ```
 */
export class Sitemap {
    constructor(readonly urls: string[]) {}

    protected static createParser(parsingState: ParsingState, onEnd: () => void, onError: (error: Error) => void): SAXStream {
        const parser = sax.createStream(true);

        parser.on('opentag', (node) => {
            if (node.name === 'loc' && parsingState.context !== undefined) {
                parsingState.loc = true;
            }
            if (node.name === 'urlset') {
                parsingState.context = 'urlset';
            }
            if (node.name === 'sitemapindex') {
                parsingState.context = 'sitemapindex';
            }
        });

        parser.on('closetag', (name) => {
            if (name === 'loc') {
                parsingState.loc = false;
            }
        });

        parser.on('text', (text) => {
            if (parsingState.loc) {
                if (parsingState.context === 'sitemapindex') {
                    if (!parsingState.visitedSitemapUrls.includes(text)) {
                        parsingState.sitemapUrls.push(text);
                    }
                }
                if (parsingState.context === 'urlset') {
                    parsingState.urls.push(text);
                }
            }
        });

        parser.on('end', onEnd);
        parser.on('error', onError);

        return parser;
    }

    /**
     * Fetch sitemap content from given URL or URLs and return URLs of referenced pages.
     * @param urls sitemap URL(s)
     * @param proxyUrl URL of a proxy to be used for fetching sitemap contents
     */
    static async load(urls: string | string[], proxyUrl?: string): Promise<Sitemap> {
        const { gotScraping } = await import('got-scraping');

        const parsingState = new ParsingState();
        parsingState.sitemapUrls = Array.isArray(urls) ? urls : [urls];

        while (parsingState.sitemapUrls.length > 0) {
            const sitemapUrl = parsingState.sitemapUrls.pop()!;
            parsingState.visitedSitemapUrls.push(sitemapUrl);
            parsingState.resetContext();

            try {
                const sitemapStream = await new Promise<ReturnType<typeof gotScraping.stream>>((resolve, reject) => {
                    const request = gotScraping.stream({ url: sitemapUrl, proxyUrl, method: 'GET' });
                    request.on('response', () => resolve(request));
                    request.on('error', reject);
                });

                if (sitemapStream.response!.statusCode === 200) {
                    await new Promise((resolve, reject) => {
                        const parser = Sitemap.createParser(parsingState, () => resolve(undefined), reject);
                        let stream: Duplex = sitemapStream;
                        if (sitemapUrl.endsWith('.gz')) {
                            stream = stream.pipe(createGunzip());
                        }
                        stream.pipe(parser);
                    });
                }
            } catch (e) {
                log.warning(`Malformed sitemap content: ${sitemapUrl}`);
            }
        }

        return new Sitemap(parsingState.urls);
    }
}
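A similar hedged sketch for loading sitemaps directly, including a gzipped one routed through a proxy; the sitemap URLs and the proxy address are placeholders, and the import again assumes `Sitemap` is re-exported from `@crawlee/utils`.

```typescript
import { Sitemap } from '@crawlee/utils'; // assumption: re-exported from the package entry point

// Plain and gzipped sitemaps can be mixed; sitemap index files are followed
// transitively, and sitemap URLs that were already visited are not fetched again.
const sitemap = await Sitemap.load(
    ['https://example.com/sitemap.xml', 'https://example.com/sitemap_products.xml.gz'],
    'http://proxy.example.com:8000', // optional proxyUrl
);

console.log(`Discovered ${sitemap.urls.length} URLs`);
```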
test/robots.test.ts
@@ -0,0 +1,51 @@
import nock from 'nock';
import { describe, expect, it, beforeEach, afterEach } from 'vitest';

import { RobotsFile } from '../src/internals/robots';

describe('RobotsFile', () => {
    beforeEach(() => {
        nock.disableNetConnect();
        nock('http://not-exists.com').persist()
            .get('/robots.txt')
            .reply(200, [
                'User-agent: *',
                'Disallow: *deny_all/',
                'crawl-delay: 10',

                'User-agent: Googlebot',
                'Disallow: *deny_googlebot/',
                'crawl-delay: 1',

                'user-agent: Mozilla',
                'crawl-delay: 2',

                'sitemap: http://not-exists.com/sitemap_1.xml',
                'sitemap: http://not-exists.com/sitemap_2.xml',
            ].join('\n'))
            .get('*')
            .reply(404);
    });

    afterEach(() => {
        nock.cleanAll();
        nock.enableNetConnect();
    });

    it('generates the correct robots.txt URL', async () => {
        const robots = await RobotsFile.find('http://not-exists.com/nested/index.html');
        expect(robots.getSitemaps()).not.toHaveLength(0);
    });

    it('parses allow/deny directives from robots.txt', async () => {
        const robots = await RobotsFile.find('http://not-exists.com/robots.txt');
        expect(robots.isAllowed('http://not-exists.com/something/page.html')).toBe(true);
        expect(robots.isAllowed('http://not-exists.com/deny_googlebot/page.html')).toBe(true);
        expect(robots.isAllowed('http://not-exists.com/deny_all/page.html')).toBe(false);
    });

    it('extracts sitemap urls', async () => {
        const robots = await RobotsFile.find('http://not-exists.com/robots.txt');
        expect(robots.getSitemaps()).toEqual(['http://not-exists.com/sitemap_1.xml', 'http://not-exists.com/sitemap_2.xml']);
    });
});
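One code path the diff defines but this test file does not exercise is the 404 fallback in `RobotsFile.load`, which returns an allow-everything instance with no sitemaps. A possible test for it, sketched in the same nock/vitest style; the hostname is a placeholder.

```typescript
import nock from 'nock';
import { describe, expect, it } from 'vitest';

import { RobotsFile } from '../src/internals/robots';

describe('RobotsFile with missing robots.txt', () => {
    it('falls back to allowing everything when robots.txt returns 404', async () => {
        nock.disableNetConnect();
        nock('http://no-robots.example.com')
            .get('/robots.txt')
            .reply(404);

        const robots = await RobotsFile.find('http://no-robots.example.com/some/page.html');

        // The 404 branch in RobotsFile.load produces an allow-all stub with no sitemaps.
        expect(robots.isAllowed('http://no-robots.example.com/anything.html')).toBe(true);
        expect(robots.getSitemaps()).toEqual([]);

        nock.cleanAll();
        nock.enableNetConnect();
    });
});
```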