diff --git a/scrapypuppeteer/__init__.py b/scrapypuppeteer/__init__.py
index 066a158..4507597 100644
--- a/scrapypuppeteer/__init__.py
+++ b/scrapypuppeteer/__init__.py
@@ -1,4 +1,4 @@
-from .request import PuppeteerRequest
+from .request import PuppeteerRequest, CloseContextRequest
 from .response import (
     PuppeteerResponse,
     PuppeteerHtmlResponse,
diff --git a/scrapypuppeteer/actions.py b/scrapypuppeteer/actions.py
index 0d5821f..18bd914 100644
--- a/scrapypuppeteer/actions.py
+++ b/scrapypuppeteer/actions.py
@@ -284,22 +284,3 @@ def __init__(self, js_action: str):
 
     def payload(self):
         return self.js_action
-
-
-class CloseContext(PuppeteerServiceAction):
-    """
-    Close contexts in the puppeteer-service.
-
-    Response for this action is PuppeteerHtmlResponse.
-    """
-
-    endpoint = "close_context"
-
-    def __init__(self, contexts: List):
-        """
-        :param list contexts: Contexts to close.
-        """
-        self.contexts = contexts
-
-    def payload(self):
-        return self.contexts
diff --git a/scrapypuppeteer/middleware.py b/scrapypuppeteer/middleware.py
index 2ef2fd1..232f3d6 100644
--- a/scrapypuppeteer/middleware.py
+++ b/scrapypuppeteer/middleware.py
@@ -18,7 +18,6 @@
     Screenshot,
     Scroll,
     CustomJsAction,
-    CloseContext,
 )
 from scrapypuppeteer.response import (
     PuppeteerResponse,
@@ -27,7 +26,7 @@
     PuppeteerRecaptchaSolverResponse,
     PuppeteerJsonResponse,
 )
-from scrapypuppeteer.request import ActionRequest, PuppeteerRequest
+from scrapypuppeteer.request import ActionRequest, PuppeteerRequest, CloseContextRequest
 
 
 class PuppeteerServiceDownloaderMiddleware:
@@ -40,7 +39,7 @@ class PuppeteerServiceDownloaderMiddleware:
 
     Additionally, the middleware uses these meta-keys, do not use them, because their changing
     could possibly (almost probably) break determined behaviour:
-    'puppeteer_request', 'dont_obey_robotstxt', 'proxy', '__closing_contexts'
+    'puppeteer_request', 'dont_obey_robotstxt', 'proxy'
 
     Settings:
 
@@ -50,7 +49,7 @@ class PuppeteerServiceDownloaderMiddleware:
     PUPPETEER_INCLUDE_HEADERS (bool|list[str])
     Determines which request headers will be sent to remote site by puppeteer service.
     Either True (all headers), False (no headers) or list of header names.
-    May be overriden per request.
+    May be overridden per request.
     By default, only cookies are sent.
 
     PUPPETEER_INCLUDE_META (bool)
@@ -98,9 +97,22 @@ def from_crawler(cls, crawler):
         return middleware
 
     def process_request(self, request, spider):
-        if not isinstance(request, PuppeteerRequest):
-            return
+        if isinstance(request, CloseContextRequest):
+            return self.process_close_context_request(request)
 
+        if isinstance(request, PuppeteerRequest):
+            return self.process_puppeteer_request(request)
+
+    def process_close_context_request(self, request: CloseContextRequest):
+        if not request.is_valid_url:
+            return request.replace(
+                url=urljoin(self.service_base_url, "/close_context"),
+                method="POST",
+                headers=Headers({"Content-Type": "application/json"}),
+                body=json.dumps(request.contexts),
+            )
+
+    def process_puppeteer_request(self, request: PuppeteerRequest):
         action = request.action
         service_url = urljoin(self.service_base_url, action.endpoint)
         service_params = self._encode_service_params(request)
@@ -173,30 +185,16 @@ def process_response(self, request, response, spider):
         if puppeteer_request is None:
             return response
 
-        if b"application/json" in response.headers.get(b"Content-Type", b""):
-            response_data = json.loads(response.text)
-        elif isinstance(puppeteer_request.action, CloseContext):
-            response_data = {
-                "html": response.text,
-                "cookies": [],
-                "contextId": puppeteer_request.context_id,
-            }
-        else:
+        if b"application/json" not in response.headers.get(b"Content-Type", b""):
             return response.replace(request=request)
 
+        response_data = json.loads(response.text)
         if response.status != 200:
             context_id = response_data.get("contextId")
             if context_id:
                 self.used_contexts[id(spider)].add(context_id)
             return response
 
-        if puppeteer_request.meta.get("__closing_contexts", False):
-            self.service_logger.log(
-                level=logging.DEBUG,
-                msg=f"Successfully closed {len(puppeteer_request.action.payload())} contexts with puppeteer request {puppeteer_request}",
-            )
-            raise IgnoreRequest()
-
         response_cls = self._get_response_class(puppeteer_request.action)
 
         return self._form_response(
@@ -227,9 +225,7 @@ def _form_response(
 
     @staticmethod
     def _get_response_class(request_action):
-        if isinstance(
-            request_action, (GoTo, GoForward, GoBack, Click, Scroll, CloseContext)
-        ):
+        if isinstance(request_action, (GoTo, GoForward, GoBack, Click, Scroll)):
             return PuppeteerHtmlResponse
         if isinstance(request_action, Screenshot):
             return PuppeteerScreenshotResponse
@@ -240,14 +236,18 @@ def _get_response_class(request_action):
 
     def close_used_contexts(self, spider):
         contexts = list(self.used_contexts.pop(id(spider), set()))
         if contexts:
-            request = PuppeteerRequest(
-                CloseContext(contexts),
-                close_page=False,  # To not write `...?closePage=1` in logs
-                include_headers=False,
-                url=self.service_base_url,
-                meta={"__closing_contexts": True},
+            request = CloseContextRequest(
+                contexts,
+                meta={"proxy": None},
+            )
+
+            dfd = self.crawler.engine.download(request)
+            dfd.addCallback(
+                lambda response: self.service_logger.log(
+                    level=logging.DEBUG,
+                    msg=f"Successfully closed {len(request.contexts)} contexts with request {response.request}",
+                )
             )
-            self.crawler.engine.crawl(request=request)
 
             raise DontCloseSpider()
diff --git a/scrapypuppeteer/request.py b/scrapypuppeteer/request.py
index 4ffc477..08a9c4d 100644
--- a/scrapypuppeteer/request.py
+++ b/scrapypuppeteer/request.py
@@ -94,3 +94,33 @@ def __init__(
         self.page_id = page_id
         self.close_page = close_page
         self.include_headers = include_headers
+
+
+class CloseContextRequest(Request):
+    """
+    This request is used to close the browser contexts.
+
+    The response for this request is a regular Scrapy HTMLResponse.
+    """
+
+    attributes: Tuple[str, ...] = Request.attributes + ("contexts",)
+
+    def __init__(self, contexts: List, **kwargs):
+        """
+        :param contexts: list of puppeteer contexts to close.
+
+        :param kwargs: arguments of scrapy.Request.
+        """
+        self.contexts = contexts
+        self.is_valid_url = False
+
+        if "url" in kwargs:
+            self.is_valid_url = True
+        url = kwargs.pop("url", "://")  # Incorrect url. To be replaced in middleware
+        super().__init__(url, **kwargs)
+
+    def __repr__(self):
+        return f"<CLOSE CONTEXT {len(self.contexts)} contexts>"
+
+    def __str__(self):
+        return self.__repr__()
diff --git a/setup.py b/setup.py
index bc0ce3d..8502e3c 100644
--- a/setup.py
+++ b/setup.py
@@ -7,7 +7,7 @@
 
 setup(
     name="scrapy-puppeteer-client",
-    version="0.1.5",
+    version="0.2.0",
     description="A library to use Puppeteer-managed browser in Scrapy spiders",
     long_description=long_description,
     long_description_content_type="text/markdown",
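A minimal usage sketch of the new `CloseContextRequest` from a spider's point of view. This is an illustration, not part of the diff: the spider, the target URL, and the `on_context_closed` callback are assumptions, and `response.context_id` is the context identifier carried by `PuppeteerResponse`. Since no `url` is passed to `CloseContextRequest`, the middleware substitutes `<service_base_url>/close_context` and POSTs the context list as JSON:

```python
import scrapy

from scrapypuppeteer import CloseContextRequest, PuppeteerRequest
from scrapypuppeteer.actions import GoTo


class ExampleSpider(scrapy.Spider):
    """Hypothetical spider showing explicit browser-context cleanup."""

    name = "example"

    def start_requests(self):
        # GoTo opens the page in a browser context managed by puppeteer-service.
        yield PuppeteerRequest(GoTo("https://example.com"), callback=self.parse)

    def parse(self, response, **kwargs):
        yield {"title": response.css("title::text").get()}
        # Release the browser context as soon as we are done with it, instead
        # of waiting for close_used_contexts() to sweep leftovers on spider idle.
        yield CloseContextRequest(
            [response.context_id],
            callback=self.on_context_closed,
            meta={"proxy": None},
        )

    def on_context_closed(self, response):
        self.logger.debug("Context closed, service replied with HTTP %d", response.status)
```

Passing `meta={"proxy": None}` mirrors what the middleware itself does in `close_used_contexts`, so the close call goes directly to the service rather than through any proxy configured for regular requests.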