forked from thomasdondorf/puppeteer-cluster
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathalexa-1m.js
45 lines (37 loc) · 1.42 KB
/
alexa-1m.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
// You need to download the Alexa 1M from http://s3.amazonaws.com/alexa-static/top-1m.csv.zip
// and unzip it into this directory
const { Cluster } = require('../dist');
const fs = require('fs');
const util = require('util');
const readFile = util.promisify(fs.readFile);
/**
 * Extracts the domains from the text of the top-1m CSV file.
 *
 * For every line that contains a comma, the text after the first comma is
 * taken as the domain. Surrounding whitespace (including the `\r` left behind
 * when the file uses CRLF line endings) is stripped. Lines without a comma and
 * lines whose domain part is empty are skipped.
 *
 * @param {string} csvText - Raw contents of the CSV file.
 * @returns {string[]} Domains in the order they appear in the file.
 */
function extractDomains(csvText) {
  const domains = [];
  for (const line of csvText.split('\n')) {
    const splitterIndex = line.indexOf(',');
    if (splitterIndex === -1) {
      continue;
    }
    // trim() removes a trailing '\r' so CRLF files do not yield broken URLs
    const domain = line.slice(splitterIndex + 1).trim();
    if (domain !== '') {
      domains.push(domain);
    }
  }
  return domains;
}

// Launches a cluster, queues every domain listed in top-1m.csv and logs the
// document.title of each crawled page.
(async () => {
  const cluster = await Cluster.launch({
    concurrency: Cluster.CONCURRENCY_CONTEXT,
    maxConcurrency: 2,
    monitor: true,
  });

  try {
    // Extracts document.title of the crawled pages
    await cluster.task(async ({ page, data: url }) => {
      await page.goto(url, { waitUntil: 'domcontentloaded' });
      const pageTitle = await page.evaluate(() => document.title);
      console.log(`Page title of ${url} is ${pageTitle}`);
    });

    // In case of problems, log them
    cluster.on('taskerror', (err, data) => {
      console.log(` Error crawling ${data}: ${err.message}`);
    });

    // Read the top-1m.csv file from the current directory
    const csvFile = await readFile(`${__dirname}/top-1m.csv`, 'utf8');
    for (const domain of extractDomains(csvFile)) {
      // queue the domain
      cluster.queue(`http://www.${domain}`);
    }

    await cluster.idle();
  } finally {
    // Close the cluster even when something above throws (for example when
    // top-1m.csv has not been downloaded yet), so it is never left open.
    await cluster.close();
  }
})().catch((err) => {
  // Surface start-up/shutdown failures instead of leaving the promise
  // rejection unhandled, and signal the failure through the exit code.
  console.error(err);
  process.exitCode = 1;
});