Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/master' into cntx-restore
Browse files Browse the repository at this point in the history
# Conflicts:
#	scrapypuppeteer/middleware.py
  • Loading branch information
MatthewZMSU committed Oct 22, 2024
2 parents 6656461 + 98704eb commit a5bcf16
Show file tree
Hide file tree
Showing 25 changed files with 272 additions and 81 deletions.
27 changes: 27 additions & 0 deletions .github/workflows/ruff.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Runs the Ruff formatter check and linter on every push and pull request.
name: Ruff Code Check

on: [push, pull_request]

jobs:
  ruff:
    runs-on: ubuntu-latest

    steps:
      # checkout@v3 / setup-python@v2 run on deprecated Node runtimes;
      # use the current major versions of both actions.
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.x"

      - name: Install Ruff
        run: |
          pip install ruff
      # `--check` only reports files that would be reformatted; nothing is rewritten.
      - name: Run Ruff Format
        run: |
          ruff format --check
      - name: Run Ruff Check
        run: |
          ruff check .
8 changes: 8 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,14 @@ Examples of contributions include:

`Please formalize your pull request (PR)` you will get.

**Before each push or PR, run in the root directory of the project:**

```bash
ruff check

ruff format
```

---
# Code of Conduct

Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ Here is the list of available actions:
- `GoForward(options)` - navigate forward in history
- `GoBack(options)` - navigate back in history
- `Click(selector, click_options, wait_options)` - click on element on page
- `Compose(*actions)` - composition of several puppeteer actions
- `Scroll(selector, wait_options)` - scroll page
- `Screenshot(options)` - take screenshot
- `RecaptchaSolver(solve_recaptcha, close_on_empty)` - find or solve recaptcha on page
Expand Down
3 changes: 2 additions & 1 deletion examples/spiders/auto_recaptcha.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import base64
import logging

import scrapy
import base64
from twisted.python.failure import Failure

from scrapypuppeteer import PuppeteerRequest
Expand Down
57 changes: 57 additions & 0 deletions examples/spiders/compose.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
from logging import ERROR

import scrapy
from scrapy.utils.log import failure_to_exc_info
from twisted.python.failure import Failure

from scrapypuppeteer import (
PuppeteerRequest,
PuppeteerResponse,
PuppeteerScreenshotResponse,
)
from scrapypuppeteer.actions import Click, Compose, GoTo, Screenshot, Scroll


class ComposeSpider(scrapy.Spider):
    """
    Example spider demonstrating the `Compose` action.

    A single PuppeteerRequest carries a whole chain of actions (open a page,
    click twice, scroll, take a screenshot). The callback checks that the
    response it receives is the screenshot response of the last action.
    """

    name = "compose"

    custom_settings = {
        "DOWNLOADER_MIDDLEWARES": {
            "scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware": 1042,
        },
    }

    def start_requests(self):
        """Build the composed action and yield the only request of the spider."""
        goto = GoTo("https://pptr.dev")
        click_1 = Click(
            "#__docusaurus > nav > div.navbar__inner > div:nth-child(1) > a:nth-child(3)"
        )
        click_2 = Click(
            "#__docusaurus_skipToContent_fallback > div > div > aside > div > "
            "div > nav > ul > li:nth-child(1) > ul > li:nth-child(3) > a"
        )
        # A `Compose` may itself be passed to another `Compose`.
        click = Compose(click_1, click_2)
        scroll = Scroll()
        screenshot = Screenshot(options={"full_page": True, "type": "jpeg"})

        compose_action = Compose(
            goto,
            click,
            scroll,
            screenshot,
        )

        yield PuppeteerRequest(
            compose_action,
            callback=self.parse,
            errback=self.errback,
            close_page=True,
        )

    def parse(self, response: PuppeteerResponse):
        """Check that the response belongs to the final `Screenshot` action."""
        assert isinstance(response, PuppeteerScreenshotResponse)
        self.log("Spider worked fine!")

    def errback(self, failure: Failure):
        """
        Log a failed request at ERROR level together with its traceback.

        The failure is converted to an ``exc_info`` tuple and handed to the
        logger through the ``exc_info`` keyword, so the traceback is rendered
        by the logging machinery instead of a tuple being logged as the message.
        """
        self.log(
            "Compose request failed",
            level=ERROR,
            exc_info=failure_to_exc_info(failure),
        )
8 changes: 5 additions & 3 deletions examples/spiders/fill_form.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import base64

import scrapy

from scrapypuppeteer import PuppeteerRequest, PuppeteerScreenshotResponse
from scrapypuppeteer.actions import Screenshot, FillForm
import base64
from scrapypuppeteer.actions import FillForm, Screenshot


class FormActionSpider(scrapy.Spider):
Expand Down Expand Up @@ -34,5 +36,5 @@ def screenshot(self, response):
@staticmethod
def make_screenshot(response: PuppeteerScreenshotResponse, **kwargs):
data = response.screenshot
with open(f"screenshot.png", "wb") as fh:
with open("screenshot.png", "wb") as fh:
fh.write(base64.b64decode(data))
1 change: 1 addition & 0 deletions examples/spiders/har.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import scrapy

from scrapypuppeteer import PuppeteerRequest
from scrapypuppeteer.actions import Har

Expand Down
5 changes: 3 additions & 2 deletions examples/spiders/manual_recaptcha.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import base64
import logging

import scrapy
import base64
from twisted.python.failure import Failure

from scrapypuppeteer import PuppeteerRequest
from scrapypuppeteer.actions import GoTo, RecaptchaSolver, Click, Screenshot
from scrapypuppeteer.actions import Click, GoTo, RecaptchaSolver, Screenshot
from scrapypuppeteer.response import PuppeteerResponse, PuppeteerScreenshotResponse


Expand Down
2 changes: 1 addition & 1 deletion examples/spiders/meduza.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import scrapy

from scrapypuppeteer import PuppeteerRequest, PuppeteerHtmlResponse
from scrapypuppeteer import PuppeteerHtmlResponse, PuppeteerRequest


class MeduzaSpider(scrapy.Spider):
Expand Down
5 changes: 2 additions & 3 deletions examples/spiders/webscraperio.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
import scrapy

from scrapypuppeteer import PuppeteerRequest
from scrapypuppeteer.actions import GoTo, Scroll, Click
from scrapypuppeteer.actions import Click, GoTo, Scroll


class EcommerceSiteSpider(scrapy.Spider):

@staticmethod
def extract_items(list_page_response):
for item_selector in list_page_response.css("div.row div.thumbnail"):
Expand All @@ -29,7 +28,7 @@ def extract_item(detail_page_response):
"description": detail_page_response.css("p.description::text").get(),
"rating": len(detail_page_response.css("span.glyphicon-star")),
"reviews_count": int(
detail_page_response.css(".ratings::text").re_first("\d+")
detail_page_response.css(".ratings::text").re_first(r"\d+")
),
}

Expand Down
16 changes: 16 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Ruff configuration shared by the linter (`ruff check`) and the formatter
# (`ruff format`).
[tool.ruff]
line-length = 88
# Only report problems; do not rewrite files unless `--fix` is passed explicitly.
fix = false
indent-width = 4

[tool.ruff.lint]
# Enabled rule families: F (Pyflakes), C (prefix covering flake8-comprehensions
# C4xx and mccabe C90x), W (pycodestyle warnings), I (isort import ordering).
select = ["F", "C", "W", "I"]
# NOTE(review): E203, E501 and N807 belong to families (E, N) that are not in
# `select` above, so they appear to have no effect unless those families are
# enabled later -- confirm this is intended.
ignore = ["E203", "E501", "F401", "C408", "F811", "N807"]

[tool.ruff.format]
indent-style = "space"
# Keep whatever line ending each file already uses.
line-ending = "auto"
quote-style = "double"
# A trailing comma written by the author keeps the collection split one item per line.
skip-magic-trailing-comma = false
# Code examples inside docstrings are formatted too, using this line length.
docstring-code-line-length = 88
docstring-code-format = true
24 changes: 12 additions & 12 deletions scrapypuppeteer/__init__.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
from .actions import (
PuppeteerServiceAction,
GoTo,
GoForward,
GoBack,
Click,
Scroll,
Screenshot,
Har,
CustomJsAction,
FillForm,
GoBack,
GoForward,
GoTo,
Har,
PuppeteerServiceAction,
RecaptchaSolver,
CustomJsAction,
Screenshot,
Scroll,
)
from .request import PuppeteerRequest, CloseContextRequest
from .request import CloseContextRequest, PuppeteerRequest
from .response import (
PuppeteerResponse,
PuppeteerHtmlResponse,
PuppeteerScreenshotResponse,
PuppeteerRecaptchaSolverResponse,
PuppeteerJsonResponse,
PuppeteerRecaptchaSolverResponse,
PuppeteerResponse,
PuppeteerScreenshotResponse,
)
45 changes: 41 additions & 4 deletions scrapypuppeteer/actions.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
from abc import abstractmethod, ABC
from abc import ABC, abstractmethod
from typing import List, Tuple


class PuppeteerServiceAction(ABC):
content_type = "application/json"

@property
@abstractmethod
def endpoint(self): ...

content_type = "application/json"

@abstractmethod
def payload(self): ...

Expand Down Expand Up @@ -292,7 +292,8 @@ class RecaptchaSolver(PuppeteerServiceAction):
Response for this action is PuppeteerJsonResponse. You can get the return values
via self.data['recaptcha_data'].
You can visit https://github.com/berstend/puppeteer-extra/tree/master/packages/puppeteer-extra-plugin-recaptcha#result-object
You can visit
https://github.com/berstend/puppeteer-extra/tree/master/packages/puppeteer-extra-plugin-recaptcha#result-object
to get information about return value.
"""

Expand Down Expand Up @@ -335,3 +336,39 @@ def __init__(self, js_action: str):

def payload(self):
return self.js_action


class Compose(PuppeteerServiceAction):
    """
    Bundle several scrapy-puppeteer actions into one action that is sent to
    the service as a single request.

    Nested `Compose` instances are expanded in place, so `self.actions` is
    always a flat list of actions.
    Response for this action is PuppeteerResponse to the last action in the
    sequence.
    """

    endpoint = "compose"

    def __init__(self, *actions: PuppeteerServiceAction):
        """
        :param actions: actions to run in order; each may itself be a `Compose`.
        :raises ValueError: if no action remains after flattening.
        """
        self.actions = self.__flatten(actions)

    @staticmethod
    def __flatten(
        actions: Tuple[PuppeteerServiceAction, ...],
    ) -> List[PuppeteerServiceAction]:
        """Return `actions` as a flat list, expanding every nested `Compose`."""
        flat: List[PuppeteerServiceAction] = []
        for item in actions:
            # A nested Compose has already been flattened by its own __init__.
            flat += item.actions if isinstance(item, Compose) else [item]
        if len(flat) == 0:
            raise ValueError("No actions provided in `Compose`.")
        return flat

    def payload(self):
        """Serialize every action as an ``{"endpoint", "body"}`` entry."""
        serialized = []
        for action in self.actions:
            serialized.append({"endpoint": action.endpoint, "body": action.payload()})
        return {"actions": serialized}
13 changes: 11 additions & 2 deletions scrapypuppeteer/browser_managers/playwright_browser_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
import base64
import uuid

from playwright.async_api import async_playwright
import syncer
from playwright.async_api import async_playwright

from scrapypuppeteer.browser_managers import BrowserManager
from scrapypuppeteer.request import CloseContextRequest, PuppeteerRequest
Expand All @@ -14,7 +14,6 @@


class ContextManager:

def __init__(self):
self.browser = syncer.sync(self.launch_browser())
self.contexts = {}
Expand Down Expand Up @@ -64,6 +63,7 @@ def __init__(self):
self.action_map = {
"goto": self.goto,
"click": self.click,
"compose": self.compose,
"back": self.go_back,
"forward": self.go_forward,
"scroll": self.scroll,
Expand Down Expand Up @@ -358,6 +358,15 @@ async def async_fill_form():

return syncer.sync(async_fill_form())

def compose(self, request: PuppeteerRequest):
    """
    Run every action of a composed request in order and return the last response.

    The page and context ids resolved for the request are stored on the request
    first. Each sub-action is then dispatched through `self.action_map` by its
    endpoint, using a copy of the request whose action is replaced by that
    sub-action. The response of the final sub-action is returned with its
    `puppeteer_request` set to the original composed request.
    """
    _, ctx_id, pg_id = self.get_page_from_request(request)
    request.page_id = pg_id
    request.context_id = ctx_id

    for sub_action in request.action.actions:
        handler = self.action_map[sub_action.endpoint]
        sub_request = request.replace(action=sub_action)
        response = handler(sub_request)
    # Only the response of the last sub-action is handed back to the caller.
    return response.replace(puppeteer_request=request)

def action(self, request: PuppeteerRequest):
raise ValueError("CustomJsAction is not available in local mode")

Expand Down
Loading

0 comments on commit a5bcf16

Please sign in to comment.