Merge branch 'dev' into 0.2.2.dev2
touma-I committed Oct 31, 2024
2 parents e30d1ac + a725112 commit 0b36f29
Showing 4 changed files with 121 additions and 14 deletions.
86 changes: 84 additions & 2 deletions data-connector-lib/src/dpk_connector/core/crawler.py
@@ -85,6 +85,15 @@ def async_crawl(
disallow_mime_types: Collection[str] = (),
depth_limit: int = -1,
download_limit: int = -1,
concurrent_requests: int = 16,
concurrent_requests_per_domain: int = 8,
download_delay: float = 0,
randomize_download_delay: bool = True,
download_timeout: float = 180,
autothrottle_enabled: bool = True,
autothrottle_max_delay: float = 60,
autothrottle_target_concurrency: float = 8,
robots_max_crawl_delay: float = 60,
) -> Deferred[None]:
# Assisted by WCA@IBM
# Latest GenAI contribution: ibm/granite-20b-code-instruct-v2
@@ -103,12 +112,21 @@ def async_crawl(
disallow_mime_types (Collection[str]): A collection of MIME types to disallow during the crawl. Default is an empty collection.
depth_limit (int): The maximum depth of the crawl. Default is -1, which means no limit.
download_limit (int): The maximum number of pages to download. Default is -1, which means no limit. This is a soft limit, meaning that a crawler may download more pages than the specified limit.
concurrent_requests (int): The maximum number of concurrent requests to make. Default is 16.
concurrent_requests_per_domain (int): The maximum number of concurrent requests to make per domain. Default is 8.
download_delay (float): The delay between consecutive requests. Default is 0.
randomize_download_delay (bool): If True, the download delay is randomized between 0.5 * `download_delay` and 1.5 * `download_delay`. Default is True.
download_timeout (float): The timeout for each request. Default is 180 seconds.
autothrottle_enabled (bool): If True, autothrottling is enabled. Default is True.
autothrottle_max_delay (float): The maximum delay between consecutive requests when autothrottling is enabled. Default is 60 seconds.
autothrottle_target_concurrency (float): The target concurrency for autothrottling. Default is 8.
robots_max_crawl_delay (float): The maximum crawl delay allowed by the robots.txt file. Default is 60 seconds.
Returns:
Deferred[None]: A Twisted deferred object that can be used to wait for the crawler to finish.
"""
if not seed_urls:
raise ValueError(f"Empty seed URLs.")
raise ValueError("Empty seed URLs.")
for url in seed_urls:
if not validate_url(url):
raise ValueError(f"Seed URL {url} is not valid.")
@@ -119,14 +137,32 @@ def async_crawl(
raise ValueError(f"Invalid depth limit {depth_limit}")
if download_limit < -1:
raise ValueError(f"Invalid download limit {download_limit}")
if concurrent_requests < 1:
raise ValueError(f"Invalid concurrent requests {concurrent_requests}")
if concurrent_requests_per_domain < 1:
raise ValueError(
f"Invalid concurrent reuqests per domain {concurrent_requests_per_domain}"
)
if download_delay < 0:
raise ValueError(f"Invalid download delay {download_delay}")
if download_timeout < 0:
raise ValueError(f"Invalid donwload timeout {download_timeout}")
if autothrottle_max_delay < 0:
raise ValueError(f"Invalid autothrottle max delay {autothrottle_max_delay}")
if autothrottle_target_concurrency < 1:
raise ValueError(
f"Invalid autothrottle target concurrency {autothrottle_target_concurrency}"
)
if robots_max_crawl_delay < 0:
raise ValueError(f"Invalid robots max crawl delay {robots_max_crawl_delay}")

settings = Settings()
settings.setmodule("dpk_connector.core.settings", priority="project")

if user_agent:
settings.set("USER_AGENT", user_agent, priority="spider")
if headers:
settings.set("DEFAULT_REQUEST_HEADERS", headers)
settings.set("DEFAULT_REQUEST_HEADERS", headers, priority="spider")
if depth_limit == 0:
depth_limit = -1
elif depth_limit == -1:
@@ -135,6 +171,25 @@
if download_limit == -1:
download_limit = 0
settings.set("CLOSESPIDER_ITEMCOUNT", download_limit, priority="spider")
settings.set("CONCURRENT_REQUESTS", concurrent_requests, priority="spider")
settings.set(
"CONCURRENT_REQUESTS_PER_DOMAIN",
concurrent_requests_per_domain,
priority="spider",
)
settings.set("DOWNLOAD_DELAY", download_delay, priority="spider")
settings.set(
"RANDOMIZE_DOWNLOAD_DELAY", randomize_download_delay, priority="spider"
)
settings.set("DOWNLOAD_TIMEOUT", download_timeout, priority="spider")
settings.set("AUTOTHROTTLE_ENABLED", autothrottle_enabled, priority="spider")
settings.set("AUTOTHROTTLE_MAX_DELAY", autothrottle_max_delay, priority="spider")
settings.set(
"AUTOTHROTTLE_TARGET_CONCURRENCY",
autothrottle_target_concurrency,
priority="spider",
)
settings.set("ROBOTS_MAX_CRAWL_DELAY", robots_max_crawl_delay, priority="spider")

runner = MultiThreadedCrawlerRunner(settings)
runner.crawl(
@@ -169,6 +224,15 @@ def crawl(
disallow_mime_types: Collection[str] = (),
depth_limit: int = -1,
download_limit: int = -1,
concurrent_requests: int = 16,
concurrent_requests_per_domain: int = 8,
download_delay: float = 0,
randomize_download_delay: bool = True,
download_timeout: float = 180,
autothrottle_enabled: bool = True,
autothrottle_max_delay: float = 60,
autothrottle_target_concurrency: float = 8,
robots_max_crawl_delay: float = 60,
) -> None:
# Assisted by WCA@IBM
# Latest GenAI contribution: ibm/granite-20b-code-instruct-v2
@@ -187,6 +251,15 @@ def crawl(
disallow_mime_types (Collection[str]): A collection of MIME types to disallow during the crawl. Default is an empty collection.
depth_limit (int): The maximum depth of the crawl. Default is -1, which means no limit.
download_limit (int): The maximum number of pages to download. Default is -1, which means no limit. This is a soft limit, meaning that a crawler may download more pages than the specified limit.
concurrent_requests (int): The maximum number of concurrent requests to make. Default is 16.
concurrent_requests_per_domain (int): The maximum number of concurrent requests to make per domain. Default is 8.
download_delay (float): The delay between consecutive requests. Default is 0.
randomize_download_delay (bool): If True, the download delay is randomized between 0.5 * `download_delay` and 1.5 * `download_delay`. Default is True.
download_timeout (float): The timeout for each request. Default is 180 seconds.
autothrottle_enabled (bool): If True, autothrottling is enabled. Default is True.
autothrottle_max_delay (float): The maximum delay between consecutive requests when autothrottling is enabled. Default is 60 seconds.
autothrottle_target_concurrency (float): The target concurrency for autothrottling. Default is 8.
robots_max_crawl_delay (float): The maximum crawl delay allowed by the robots.txt file. Default is 60 seconds.
Returns:
None
@@ -209,6 +282,15 @@ def on_completed(result: Any):
disallow_mime_types,
depth_limit,
download_limit,
concurrent_requests,
concurrent_requests_per_domain,
download_delay,
randomize_download_delay,
download_timeout,
autothrottle_enabled,
autothrottle_max_delay,
autothrottle_target_concurrency,
robots_max_crawl_delay,
)
d.addBoth(on_completed)
with condition:
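To see the new knobs end to end, here is a minimal usage sketch (not part of this commit); the import path and the on_downloaded signature mirror the test file below, and all parameter values are illustrative:

# Hedged usage sketch: assumes dpk_connector is installed and that crawl()
# blocks until the crawl finishes, as the synchronous wrapper above suggests.
from dpk_connector.core.crawler import crawl

def on_downloaded(url: str, body: bytes, headers: dict[str, str]) -> None:
    # Invoked once per downloaded page.
    print(f"Downloaded {url} ({len(body)} bytes)")

crawl(
    ["https://example.com"],
    on_downloaded,
    depth_limit=2,                     # follow links at most two hops deep
    download_limit=100,                # soft cap on downloaded pages
    concurrent_requests=16,            # global cap (the default)
    concurrent_requests_per_domain=8,  # per-domain cap (the default)
    download_delay=0.5,                # randomized to 0.25-0.75 s per request
    download_timeout=30,
    autothrottle_target_concurrency=4,
    robots_max_crawl_delay=60,
)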
11 changes: 0 additions & 11 deletions data-connector-lib/src/dpk_connector/core/settings.py
@@ -16,21 +16,10 @@

# Robots
ROBOTSTXT_OBEY = True
ROBOTS_MAX_CRAWL_DELAY = 60
ROBOTSTXT_PARSER = "dpk_connector.core.middlewares.DelayingProtegoRobotParser"

# Downloader parameters
CONCURRENT_REQUESTS = 20
CONCURRENT_REQUESTS_PER_DOMAIN = 10
DOWNLOAD_DELAY = 0
RANDOMIZE_DOWNLOAD_DELAY = True
DOWNLOAD_TIMEOUT = 180

# Autothrottle
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 0
AUTOTHROTTLE_MAX_DELAY = 300
AUTOTHROTTLE_TARGET_CONCURRENCY = 10
AUTOTHROTTLE_DEBUG = False

# Middlewares/pipelines/extensions
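These module-level defaults could be removed because async_crawl now sets the same keys per crawl at "spider" priority, which outranks the "project" priority this settings module is loaded at. A short sketch of the mechanics (standard Scrapy behavior, not code from this commit):

# "spider" priority (30) beats "project" priority (20) in Scrapy's settings
# model, so the per-crawl values win over the module defaults.
from scrapy.settings import Settings

settings = Settings()
settings.setmodule("dpk_connector.core.settings", priority="project")
settings.set("CONCURRENT_REQUESTS", 16, priority="spider")
print(settings.getint("CONCURRENT_REQUESTS"))  # -> 16, the spider-level value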
30 changes: 30 additions & 0 deletions data-connector-lib/test/dpk_connector/core/test_crawler.py
@@ -37,3 +37,33 @@ def on_downloaded(url: str, body: bytes, headers: dict[str, str]):
with pytest.raises(ValueError) as e:
crawl(["http://example.com"], on_downloaded, download_limit=-10)
assert isinstance(e.value, ValueError) is True

with pytest.raises(ValueError) as e:
crawl(["http://example.com"], on_downloaded, concurrent_requests=-10)
assert isinstance(e.value, ValueError) is True

with pytest.raises(ValueError) as e:
crawl(["http://example.com"], on_downloaded, concurrent_requests_per_domain=-10)
assert isinstance(e.value, ValueError) is True

with pytest.raises(ValueError) as e:
crawl(["http://example.com"], on_downloaded, download_delay=-0.1)
assert isinstance(e.value, ValueError) is True

with pytest.raises(ValueError) as e:
crawl(["http://example.com"], on_downloaded, download_timeout=-0.1)
assert isinstance(e.value, ValueError) is True

with pytest.raises(ValueError) as e:
crawl(["http://example.com"], on_downloaded, autothrottle_max_delay=-0.1)
assert isinstance(e.value, ValueError) is True

with pytest.raises(ValueError) as e:
crawl(
["http://example.com"], on_downloaded, autothrottle_target_concurrency=0.5
)
assert isinstance(e.value, ValueError) is True

with pytest.raises(ValueError) as e:
crawl(["http://example.com"], on_downloaded, robots_max_crawl_delay=-0.1)
assert isinstance(e.value, ValueError) is True
8 changes: 7 additions & 1 deletion resources.md
@@ -15,9 +15,15 @@

## Example Code

## Tutorials / Blogs
## Blogs / Tutorials

- [**IBM Developer Blog**](https://developer.ibm.com/blogs/awb-unleash-potential-llms-data-prep-kit/)

## Workshops

- **2024-09-21: "RAG with Data Prep Kit" Workshop** @ Mountain View, CA, USA - [info](https://github.com/sujee/data-prep-kit-examples/blob/main/events/2024-09-21__RAG-workshop-data-riders.md)

## Discord

- [**Data Prep Kit Discord Channel**](https://discord.com/channels/1276554812359442504/1286046139921207476)
