forked from shioju/crawlee-core
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrouter.js
179 lines (179 loc) · 5.55 KB
/
router.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.Router = void 0;
const errors_1 = require("./errors");
// Module-level sentinel used as the key of the default (fallback) handler in a
// router's `routes` map. Being a Symbol, it can never be equal to a string label.
const defaultRoute = Symbol('default-route');
/**
* Simple router that works based on request labels. This instance can then serve as a `requestHandler` of your crawler.
*
* ```ts
* import { Router, CheerioCrawler, CheerioCrawlingContext } from 'crawlee';
*
* const router = Router.create<CheerioCrawlingContext>();
*
* // we can also use factory methods for specific crawling contexts, the above equals to:
* // import { createCheerioRouter } from 'crawlee';
* // const router = createCheerioRouter();
*
* router.addHandler('label-a', async (ctx) => {
* ctx.log.info('...');
* });
* router.addDefaultHandler(async (ctx) => {
* ctx.log.info('...');
* });
*
* const crawler = new CheerioCrawler({
* requestHandler: router,
* });
* await crawler.run();
* ```
*
* Alternatively we can use the default router instance from crawler object:
*
* ```ts
* import { CheerioCrawler } from 'crawlee';
*
* const crawler = new CheerioCrawler();
*
* crawler.router.addHandler('label-a', async (ctx) => {
* ctx.log.info('...');
* });
* crawler.router.addDefaultHandler(async (ctx) => {
* ctx.log.info('...');
* });
*
* await crawler.run();
* ```
*
* For convenience, we can also define the routes right when creating the router:
*
* ```ts
* import { CheerioCrawler, createCheerioRouter } from 'crawlee';
* const crawler = new CheerioCrawler({
* requestHandler: createCheerioRouter({
* 'label-a': async (ctx) => { ... },
* 'label-b': async (ctx) => { ... },
* }),
* });
* await crawler.run();
* ```
*
* Middlewares are also supported via the `router.use` method. There can be multiple
* middlewares for a single router, they will be executed sequentially in the same
* order as they were registered.
*
* ```ts
* crawler.router.use(async (ctx) => {
* ctx.log.info('...');
* });
* ```
*/
class Router {
    /**
     * Not intended to be called directly - build routers through `Router.create()`.
     * @ignore
     */
    constructor() {
        // `routes` maps a label (or the default-route sentinel) to its handler,
        // `middlewares` keeps the registered middlewares in registration order.
        Object.defineProperties(this, {
            routes: { value: new Map(), writable: true, enumerable: true, configurable: true },
            middlewares: { value: [], writable: true, enumerable: true, configurable: true },
        });
    }
    /**
     * Adds a handler for the given label.
     * Throws if a handler is already registered under that label.
     */
    addHandler(label, handler) {
        this.validate(label);
        this.routes.set(label, handler);
    }
    /**
     * Adds the fallback handler, used when no label specific handler matches.
     * Throws if a default handler is already registered.
     */
    addDefaultHandler(handler) {
        this.validate(defaultRoute);
        this.routes.set(defaultRoute, handler);
    }
    /**
     * Appends a middleware. Middlewares run before the matching route handler,
     * one after another, in the order in which they were added.
     */
    use(middleware) {
        this.middlewares.push(middleware);
    }
    /**
     * Looks up the handler for the given label. A missing (falsy) or unknown label
     * resolves to the default handler; when that is missing too, a `MissingRouteError` is thrown.
     */
    getHandler(label) {
        const registry = this.routes;
        const hasOwnRoute = Boolean(label) && registry.has(label);
        if (hasOwnRoute) {
            return registry.get(label);
        }
        if (!registry.has(defaultRoute)) {
            throw new errors_1.MissingRouteError([
                `Route not found for label '${String(label)}'.`,
                ' You must set up a route for this label or a default route.',
                ' Use `requestHandler`, `router.addHandler` or `router.addDefaultHandler`.',
            ].join(''));
        }
        return registry.get(defaultRoute);
    }
    /**
     * Guards against duplicate registration: throws when the label is already taken.
     */
    validate(label) {
        if (!this.routes.has(label)) {
            return;
        }
        if (label === defaultRoute) {
            throw new Error(`Default route is already defined!`);
        }
        throw new Error(`Route for label '${String(label)}' is already defined!`);
    }
    /**
     * Builds a new router. The returned value is an async function, so it can be passed
     * as the `requestHandler` of a crawler, and it also exposes `addHandler`,
     * `addDefaultHandler`, `getHandler` and `use`. An optional `routes` object
     * (label => handler) pre-registers handlers.
     *
     * ```ts
     * import { Router, CheerioCrawler, CheerioCrawlingContext } from 'crawlee';
     *
     * const router = Router.create<CheerioCrawlingContext>();
     * router.addHandler('label-a', async (ctx) => {
     *     ctx.log.info('...');
     * });
     * router.addDefaultHandler(async (ctx) => {
     *     ctx.log.info('...');
     * });
     *
     * const crawler = new CheerioCrawler({
     *     requestHandler: router,
     * });
     * await crawler.run();
     * ```
     */
    static create(routes) {
        const router = new Router();
        // The public methods live on an object inheriting from `Function.prototype`;
        // that object becomes the prototype of the returned function further below.
        const api = Object.create(Function.prototype);
        for (const method of ['addHandler', 'addDefaultHandler', 'getHandler', 'use']) {
            api[method] = router[method].bind(router);
        }
        Object.entries(routes ?? {}).forEach(([label, handler]) => {
            router.addHandler(label, handler);
        });
        const func = async function (context) {
            const { request } = context;
            const { url: requestedUrl, loadedUrl, label } = request;
            const visitedUrl = loadedUrl ?? requestedUrl;
            context.log.debug('Page opened.', { label, url: visitedUrl });
            // Middlewares are awaited one by one before the route handler is invoked.
            for (const runMiddleware of router.middlewares) {
                await runMiddleware(context);
            }
            const handler = router.getHandler(label);
            return handler(context);
        };
        return Object.setPrototypeOf(func, api);
    }
}
exports.Router = Router;
//# sourceMappingURL=router.js.map