Proper merging from other branch
MatthewZMSU committed Jun 28, 2024
1 parent 205600f commit 8ee03ad
Showing 5 changed files with 64 additions and 53 deletions.
scrapypuppeteer/__init__.py: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-from .request import PuppeteerRequest
+from .request import PuppeteerRequest, CloseContextRequest
 from .response import (
     PuppeteerResponse,
     PuppeteerHtmlResponse,
scrapypuppeteer/actions.py: 0 additions & 19 deletions
@@ -284,22 +284,3 @@ def __init__(self, js_action: str):
 
     def payload(self):
         return self.js_action
-
-
-class CloseContext(PuppeteerServiceAction):
-    """
-    Close contexts in the puppeteer-service.
-    Response for this action is PuppeteerHtmlResponse.
-    """
-
-    endpoint = "close_context"
-
-    def __init__(self, contexts: List):
-        """
-        :param list contexts: Contexts to close.
-        """
-        self.contexts = contexts
-
-    def payload(self):
-        return self.contexts
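
The action removed above routed context cleanup through the ordinary action pipeline; its role is taken over by the dedicated CloseContextRequest added in scrapypuppeteer/request.py below. A before/after sketch (the context id is illustrative):

    from scrapypuppeteer import CloseContextRequest

    contexts = ["illustrative-context-id"]  # ids previously wrapped in CloseContext

    # Before this commit (the action wrapped in a PuppeteerRequest, now removed):
    #     PuppeteerRequest(CloseContext(contexts), url=service_url, close_page=False)
    # After this commit (the middleware fills in the service URL):
    request = CloseContextRequest(contexts)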
scrapypuppeteer/middleware.py: 32 additions & 32 deletions
@@ -18,7 +18,6 @@
     Screenshot,
     Scroll,
     CustomJsAction,
-    CloseContext,
 )
 from scrapypuppeteer.response import (
     PuppeteerResponse,
@@ -27,7 +26,7 @@
     PuppeteerRecaptchaSolverResponse,
     PuppeteerJsonResponse,
 )
-from scrapypuppeteer.request import ActionRequest, PuppeteerRequest
+from scrapypuppeteer.request import ActionRequest, PuppeteerRequest, CloseContextRequest
 
 
 class PuppeteerServiceDownloaderMiddleware:
@@ -40,7 +39,7 @@ class PuppeteerServiceDownloaderMiddleware:
     Additionally, the middleware uses these meta-keys, do not use them, because their changing
     could possibly (almost probably) break determined behaviour:
-    'puppeteer_request', 'dont_obey_robotstxt', 'proxy', '__closing_contexts'
+    'puppeteer_request', 'dont_obey_robotstxt', 'proxy'
 
     Settings:
@@ -50,7 +49,7 @@
     PUPPETEER_INCLUDE_HEADERS (bool|list[str])
         Determines which request headers will be sent to remote site by puppeteer service.
         Either True (all headers), False (no headers) or list of header names.
-        May be overriden per request.
+        May be overridden per request.
         By default, only cookies are sent.
 
     PUPPETEER_INCLUDE_META (bool)
@@ -98,9 +97,22 @@ def from_crawler(cls, crawler):
         return middleware
 
     def process_request(self, request, spider):
-        if not isinstance(request, PuppeteerRequest):
-            return
+        if isinstance(request, CloseContextRequest):
+            return self.process_close_context_request(request)
+
+        if isinstance(request, PuppeteerRequest):
+            return self.process_puppeteer_request(request)
+
+    def process_close_context_request(self, request: CloseContextRequest):
+        if not request.is_valid_url:
+            return request.replace(
+                url=urljoin(self.service_base_url, "/close_context"),
+                method="POST",
+                headers=Headers({"Content-Type": "application/json"}),
+                body=json.dumps(request.contexts),
+            )
+
+    def process_puppeteer_request(self, request: PuppeteerRequest):
         action = request.action
         service_url = urljoin(self.service_base_url, action.endpoint)
         service_params = self._encode_service_params(request)
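
For reference, process_close_context_request turns a URL-less CloseContextRequest into a plain JSON POST against the service. A minimal sketch of the call it builds, assuming an illustrative service host:

    import json
    from urllib.parse import urljoin

    service_base_url = "http://localhost:3000"  # illustrative host; the real value comes from the service URL setting
    contexts = ["ctx-1", "ctx-2"]  # context ids accumulated during the crawl

    url = urljoin(service_base_url, "/close_context")
    body = json.dumps(contexts)  # the body is a bare JSON list of context ids
    # The middleware then issues: POST <url> with Content-Type: application/json and this body.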
@@ -173,30 +185,16 @@ def process_response(self, request, response, spider):
         if puppeteer_request is None:
             return response
 
-        if b"application/json" in response.headers.get(b"Content-Type", b""):
-            response_data = json.loads(response.text)
-        elif isinstance(puppeteer_request.action, CloseContext):
-            response_data = {
-                "html": response.text,
-                "cookies": [],
-                "contextId": puppeteer_request.context_id,
-            }
-        else:
+        if b"application/json" not in response.headers.get(b"Content-Type", b""):
             return response.replace(request=request)
 
+        response_data = json.loads(response.text)
         if response.status != 200:
             context_id = response_data.get("contextId")
             if context_id:
                 self.used_contexts[id(spider)].add(context_id)
             return response
 
-        if puppeteer_request.meta.get("__closing_contexts", False):
-            self.service_logger.log(
-                level=logging.DEBUG,
-                msg=f"Successfully closed {len(puppeteer_request.action.payload())} contexts with puppeteer request {puppeteer_request}",
-            )
-            raise IgnoreRequest()
-
         response_cls = self._get_response_class(puppeteer_request.action)
 
         return self._form_response(
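
The simplified branch above hands any non-JSON reply back to Scrapy untouched and only parses JSON payloads; failed calls still record their contextId so the context can be cleaned up later. A small sketch of the content-type check (URL and body are illustrative):

    from scrapy.http import TextResponse

    response = TextResponse(
        url="http://localhost:3000/goto",  # illustrative service endpoint
        headers={"Content-Type": "application/json"},
        body=b'{"contextId": "abc"}',
    )
    is_json = b"application/json" in response.headers.get(b"Content-Type", b"")
    print(is_json)  # True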
@@ -227,9 +225,7 @@ def _form_response(
 
     @staticmethod
     def _get_response_class(request_action):
-        if isinstance(
-            request_action, (GoTo, GoForward, GoBack, Click, Scroll, CloseContext)
-        ):
+        if isinstance(request_action, (GoTo, GoForward, GoBack, Click, Scroll)):
             return PuppeteerHtmlResponse
         if isinstance(request_action, Screenshot):
             return PuppeteerScreenshotResponse
@@ -240,14 +236,18 @@ def _get_response_class(request_action):
     def close_used_contexts(self, spider):
         contexts = list(self.used_contexts.pop(id(spider), set()))
         if contexts:
-            request = PuppeteerRequest(
-                CloseContext(contexts),
-                close_page=False,  # To not write `...?closePage=1` in logs
-                include_headers=False,
-                url=self.service_base_url,
-                meta={"__closing_contexts": True},
+            request = CloseContextRequest(
+                contexts,
+                meta={"proxy": None},
             )
-            self.crawler.engine.crawl(request=request)
+
+            dfd = self.crawler.engine.download(request)
+            dfd.addCallback(
+                lambda response: self.service_logger.log(
+                    level=logging.DEBUG,
+                    msg=f"Successfully closed {len(request.contexts)} contexts with request {response.request}",
+                )
+            )
             raise DontCloseSpider()


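The reworked close_used_contexts fires the close request straight through engine.download instead of scheduling a PuppeteerRequest with a magic meta flag, logs on the Deferred's callback, and raises DontCloseSpider so the spider stays alive until the request can run. A hedged sketch of the same idle-time pattern as a standalone extension (the hook-up to spider_idle mirrors what this middleware presumably does in from_crawler):

    from scrapy import signals
    from scrapy.exceptions import DontCloseSpider
    from scrapypuppeteer import CloseContextRequest

    class ContextCleanup:
        """Illustrative extension showing the idle-time cleanup pattern."""

        def __init__(self, crawler):
            self.crawler = crawler

        @classmethod
        def from_crawler(cls, crawler):
            ext = cls(crawler)
            # run cleanup whenever the spider goes idle
            crawler.signals.connect(ext.spider_idle, signal=signals.spider_idle)
            return ext

        def spider_idle(self, spider):
            request = CloseContextRequest(
                ["illustrative-context-id"],  # ids tracked during the crawl
                meta={"proxy": None},  # bypass proxies for the service call, as above
            )
            self.crawler.engine.download(request)
            raise DontCloseSpider()  # keep the spider alive until the download finishes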
scrapypuppeteer/request.py: 30 additions & 0 deletions
@@ -94,3 +94,33 @@ def __init__(
         self.page_id = page_id
         self.close_page = close_page
         self.include_headers = include_headers
+
+
+class CloseContextRequest(Request):
+    """
+    This request is used to close the browser contexts.
+    The response for this request is a regular Scrapy HTMLResponse.
+    """
+
+    attributes: Tuple[str, ...] = Request.attributes + ("contexts",)
+
+    def __init__(self, contexts: List, **kwargs):
+        """
+        :param contexts: list of puppeteer contexts to close.
+        :param kwargs: arguments of scrapy.Request.
+        """
+        self.contexts = contexts
+        self.is_valid_url = False
+
+        if "url" in kwargs:
+            self.is_valid_url = True
+
+        url = kwargs.pop("url", "://")  # Incorrect url. To be replaced in middleware
+        super().__init__(url, **kwargs)
+
+    def __repr__(self):
+        return f"<CLOSE CONTEXT {self.url if self.is_valid_url else 'undefined url'}>"
+
+    def __str__(self):
+        return self.__repr__()
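
A minimal usage sketch for the new request type (the context id is illustrative). With no url argument, is_valid_url stays False and the downloader middleware substitutes the service's /close_context endpoint:

    from scrapypuppeteer import CloseContextRequest

    request = CloseContextRequest(
        ["illustrative-context-id"],  # puppeteer context ids to close
        meta={"proxy": None},  # optional, mirrors the middleware's own usage
    )
    print(request)  # -> <CLOSE CONTEXT undefined url>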
setup.py: 1 addition & 1 deletion
@@ -7,7 +7,7 @@
 
 setup(
     name="scrapy-puppeteer-client",
-    version="0.1.5",
+    version="0.2.0",
     description="A library to use Puppeteer-managed browser in Scrapy spiders",
     long_description=long_description,
     long_description_content_type="text/markdown",
