From f41d5faedadb1cd496e21c2fa61d65413d035f3b Mon Sep 17 00:00:00 2001 From: Matvey Date: Wed, 23 Oct 2024 20:48:25 +0300 Subject: [PATCH] Improve follow/follow_all (#41) * Follow/follow_all * Documentation * Formatter and linter * Priority fix * Comments * Example and fix * Docstring * Docstring * ban any action except GoTo * fix page_id = None * Add Compose to except * Fix action validation * Fix action validation * Response's state is saved now --- README.md | 23 ++++------- examples/spiders/follow.py | 49 ++++++++++++++++++++++++ scrapypuppeteer/request.py | 4 +- scrapypuppeteer/response.py | 76 ++++++++++++++++++++++++++++++++++--- 4 files changed, 129 insertions(+), 23 deletions(-) create mode 100644 examples/spiders/follow.py diff --git a/README.md b/README.md index 53769d4..7daee5a 100644 --- a/README.md +++ b/README.md @@ -23,29 +23,18 @@ DOWNLOADER_MIDDLEWARES = { 'scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware': 1042 } -PUPPETEER_SERVICE_URL = 'http://localhost:3000' +PUPPETEER_SERVICE_URL = "http://localhost:3000" # Not necessary in other execution methods # To change the execution method, you must add the corresponding setting: EXECUTION_METHOD = "Puppeteer" ``` Available methods: `Puppeteer`, `Pyppeteer`, `Playwright` -The `Pyppeteer` and `Playwright` methods do not require a running service. They use the pyppeteer and playwright libraries for Python to interact with the browser. Actions such as `CustomJsAction`, `RecaptchaSolver`, and `Har` are not available when using these methods. +`Pyppeteer` and `Playwright` methods do not require a running service. +They use the pyppeteer and playwright libraries for Python to interact with the browser. +Actions such as `CustomJsAction`, `RecaptchaSolver`, and `Har` are not available when using these methods. -To use the `Pyppeteer` or `Playwright` methods you need to install Chromium. - - -## Configuration - -You should have [scrapy-puppeteer-service](https://github.com/ispras/scrapy-puppeteer-service) started. -Then add its URL to `settings.py` and enable puppeteer downloader middleware: -```python -DOWNLOADER_MIDDLEWARES = { - 'scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware': 1042 -} - -PUPPETEER_SERVICE_URL = 'http://localhost:3000' -``` +To use `Pyppeteer` or `Playwright` methods you need to install Chromium. ## Basic usage @@ -130,6 +119,8 @@ class MySpider(scrapy.Spider): ) ``` +You may also use `follow_all` method to continue interacting. + On your first request service will create new incognito browser context and new page in it. Their ids will be in returned in response object as `context_id` and `page_id` attributes. Following such response means passing context and page ids to next request. diff --git a/examples/spiders/follow.py b/examples/spiders/follow.py new file mode 100644 index 0000000..3986e99 --- /dev/null +++ b/examples/spiders/follow.py @@ -0,0 +1,49 @@ +from scrapy import Spider +from scrapy.http import Response + +from scrapypuppeteer import GoTo, PuppeteerRequest, PuppeteerResponse + + +class FollowSpider(Spider): + name = "follow" + + start_urls = ["http://quotes.toscrape.com/page/1/"] + + def start_requests(self): + for url in self.start_urls: + yield PuppeteerRequest( + GoTo(url), + close_page=False, + callback=self.goto_about, + errback=self.errback, + ) + + def goto_about(self, response: PuppeteerResponse): + # yield response.follow( + # response.css("div.quote span a")[0], + # callback=self.parse, + # errback=self.errback, + # close_page=False, + # ) + + # Or: + yield from response.follow_all( + response.css("div.quote span a"), + callback=self.parse, + errback=self.errback, + close_page=True, + ) + + # Or: + # yield from response.follow_all( + # css="div.quote span a", + # callback=self.parse, + # errback=self.errback, + # close_page=False, + # ) + + def parse(self, response: Response, **kwargs): + self.log(response.url.split("/")[-1]) + + def errback(self, failure): + self.log(failure) diff --git a/scrapypuppeteer/request.py b/scrapypuppeteer/request.py index b64f69d..628e36c 100644 --- a/scrapypuppeteer/request.py +++ b/scrapypuppeteer/request.py @@ -92,7 +92,9 @@ def __init__( if isinstance(action.actions[0], GoTo): url = action.actions[0].url elif not isinstance(action, PuppeteerServiceAction): - raise ValueError("Undefined browser action") + raise TypeError( + f"Undefined browser action: `{type(action)}`. `Expected PuppeteerServiceAction`" + ) if url is None: raise ValueError( "Request is not a goto-containing request and does not follow a response" diff --git a/scrapypuppeteer/response.py b/scrapypuppeteer/response.py index 51dee13..20e3843 100644 --- a/scrapypuppeteer/response.py +++ b/scrapypuppeteer/response.py @@ -1,11 +1,14 @@ import warnings -from typing import Tuple, Union +from typing import Generator, Tuple, Union +import parsel from scrapy.exceptions import ScrapyDeprecationWarning -from scrapy.http import TextResponse +from scrapy.http import HtmlResponse, TextResponse +from scrapy.http.response.text import _url_from_selector +from scrapy.link import Link from scrapypuppeteer import PuppeteerRequest -from scrapypuppeteer.actions import GoTo, PuppeteerServiceAction +from scrapypuppeteer.actions import Compose, GoTo, PuppeteerServiceAction class PuppeteerResponse(TextResponse): @@ -38,7 +41,7 @@ def __init__( def follow( self, - action: Union[str, PuppeteerServiceAction], + action: Union[str, parsel.Selector, Link, PuppeteerServiceAction], close_page=True, accumulate_meta: bool = False, **kwargs, @@ -55,6 +58,10 @@ def follow( page_id = None if self.puppeteer_request.close_page else self.page_id if isinstance(action, str): action = self.urljoin(action) + elif isinstance(action, parsel.Selector): + action = self.urljoin(_url_from_selector(action)) + elif isinstance(action, Link): + action = self.urljoin(action.url) elif isinstance(action, GoTo): action.url = self.urljoin(action.url) else: @@ -70,14 +77,71 @@ def follow( **kwargs, ) + def follow_all( + self, + actions=None, + close_page: bool = True, + accumulate_meta: bool = False, + css=None, + xpath=None, + **kwargs, + ) -> Generator[PuppeteerRequest, None, None]: + """ + Execute actions in the same context but in other browser pages. + Only one of `actions`, `css`, or `xpath` must be specified.` + Note that original page from which the method was called lasts unaffected. -class PuppeteerHtmlResponse(PuppeteerResponse): + :param actions: iterable of PuppeteerActions or selectors + :param close_page: whether to close page after request completion + :param accumulate_meta: whether to accumulate meta from response + :param css: selector + :param xpath: selector + :return: Iterable[PuppeteerRequest] + """ + + arguments = [x for x in (actions, css, xpath) if x is not None] + if len(arguments) != 1: + raise ValueError( + "Please supply exactly one of the following arguments: actions, css, xpath" + ) + if not actions: + if css: + actions = self.css(css) + if xpath: + actions = self.xpath(xpath) + else: + # Ban any PuppeteerAction except GoTo and GoTo-like Compose + for action in actions: + if isinstance(action, PuppeteerServiceAction): + if isinstance(action, Compose): + action = action.actions[0] + if not isinstance(action, GoTo): + raise TypeError(f"Expected GoTo, got {type(action)}") + + page_id = self.page_id + for action in actions: + self.page_id = None # Substitution of page_id in order to create new page + try: + next_request = self.follow( + action, + close_page=close_page, + accumulate_meta=accumulate_meta, + **kwargs, + ) + finally: # To save the original state of response + self.page_id = page_id + yield next_request + + +class PuppeteerHtmlResponse(PuppeteerResponse, HtmlResponse): """ scrapy.TextResponse capturing state of a page in browser. Additionally, exposes received html and cookies via corresponding attributes. """ - attributes: Tuple[str, ...] = PuppeteerResponse.attributes + ("html", "cookies") + attributes: Tuple[str, ...] = tuple( + set(PuppeteerResponse.attributes + HtmlResponse.attributes) + ) + ("html", "cookies") """ A tuple of :class:`str` objects containing the name of all public attributes of the class that are also keyword parameters of the