From d83a924e69c45b20d37dbfd12918ca15cfe0693a Mon Sep 17 00:00:00 2001
From: SHAHROKH DAIJAVAD
Date: Tue, 29 Oct 2024 15:39:35 -0700
Subject: [PATCH 1/2] Update resources.md

---
 resources.md | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/resources.md b/resources.md
index 04a87f8c0..974ca2ccb 100644
--- a/resources.md
+++ b/resources.md
@@ -15,9 +15,15 @@
 
 ## Example Code
 
-## Tutorials / Blogs
+## Blogs / Tutorials
+
+- [**IBM Developer Blog**](https://developer.ibm.com/blogs/awb-unleash-potential-llms-data-prep-kit/)
 
 ## Workshops
 
 - **2024-09-21: "RAG with Data Prep Kit" Workshop** @ Mountain View, CA, USA - [info](https://github.com/sujee/data-prep-kit-examples/blob/main/events/2024-09-21__RAG-workshop-data-riders.md)
 
+## Discord
+
+- [**Data Prep Kit Discord Channel**](https://discord.com/channels/1276554812359442504/1286046139921207476)
+

From a725112c7166f0d83a2079a340eccda4c0a0fd02 Mon Sep 17 00:00:00 2001
From: Hiroya Matsubara
Date: Thu, 31 Oct 2024 09:08:23 +0900
Subject: [PATCH 2/2] allow the user to customize crawler settings (#738)

* allow the user to customize crawler settings

Signed-off-by: Hiroya Matsubara

* remove unused values

Signed-off-by: Hiroya Matsubara

* update concurrency defaults

Signed-off-by: Hiroya Matsubara

* update defaults

Signed-off-by: Hiroya Matsubara

---------

Signed-off-by: Hiroya Matsubara
---
 .../src/dpk_connector/core/crawler.py       | 86 ++++++++++++++++++-
 .../src/dpk_connector/core/settings.py      | 11 ---
 .../test/dpk_connector/core/test_crawler.py | 30 +++++++
 3 files changed, 114 insertions(+), 13 deletions(-)

diff --git a/data-connector-lib/src/dpk_connector/core/crawler.py b/data-connector-lib/src/dpk_connector/core/crawler.py
index 491806398..4e706a944 100644
--- a/data-connector-lib/src/dpk_connector/core/crawler.py
+++ b/data-connector-lib/src/dpk_connector/core/crawler.py
@@ -85,6 +85,15 @@ def async_crawl(
     disallow_mime_types: Collection[str] = (),
     depth_limit: int = -1,
     download_limit: int = -1,
+    concurrent_requests: int = 16,
+    concurrent_requests_per_domain: int = 8,
+    download_delay: float = 0,
+    randomize_download_delay: bool = True,
+    download_timeout: float = 180,
+    autothrottle_enabled: bool = True,
+    autothrottle_max_delay: float = 60,
+    autothrottle_target_concurrency: float = 8,
+    robots_max_crawl_delay: float = 60,
 ) -> Deferred[None]:
     # Assisted by WCA@IBM
     # Latest GenAI contribution: ibm/granite-20b-code-instruct-v2
@@ -103,12 +112,21 @@ def async_crawl(
         disallow_mime_types (Collection[str]): A collection of MIME types to disallow during the crawl. Default is an empty collection.
         depth_limit (int): The maximum depth of the crawl. Default is -1, which means no limit.
         download_limit (int): The maximum number of pages to download. Default is -1, which means no limit. This is a soft limit, meaning that a crawler may download more pages than the specified limit.
+        concurrent_requests (int): The maximum number of concurrent requests to make. Default is 16.
+        concurrent_requests_per_domain (int): The maximum number of concurrent requests to make per domain. Default is 8.
+        download_delay (float): The delay between consecutive requests. Default is 0.
+        randomize_download_delay (bool): If True, the download delay is randomized between 0.5 * `download_delay` and 1.5 * `download_delay`. Default is True.
+        download_timeout (float): The timeout for each request. Default is 180 seconds.
+        autothrottle_enabled (bool): If True, autothrottling is enabled. Default is True.
+        autothrottle_max_delay (float): The maximum delay between consecutive requests when autothrottling is enabled. Default is 60 seconds.
+        autothrottle_target_concurrency (float): The target concurrency for autothrottling. Default is 8.
+        robots_max_crawl_delay (float): The maximum crawl delay allowed by the robots.txt file. Default is 60 seconds.
 
     Returns:
         Deferred[None]: A Twisted deferred object that can be used to wait for the crawler to finish.
     """
     if not seed_urls:
-        raise ValueError(f"Empty seed URLs.")
+        raise ValueError("Empty seed URLs.")
     for url in seed_urls:
         if not validate_url(url):
             raise ValueError(f"Seed URL {url} is not valid.")
@@ -119,6 +137,24 @@ def async_crawl(
         raise ValueError(f"Invalid depth limit {depth_limit}")
     if download_limit < -1:
         raise ValueError(f"Invalid download limit {download_limit}")
+    if concurrent_requests < 1:
+        raise ValueError(f"Invalid concurrent requests {concurrent_requests}")
+    if concurrent_requests_per_domain < 1:
+        raise ValueError(
+            f"Invalid concurrent requests per domain {concurrent_requests_per_domain}"
+        )
+    if download_delay < 0:
+        raise ValueError(f"Invalid download delay {download_delay}")
+    if download_timeout < 0:
+        raise ValueError(f"Invalid download timeout {download_timeout}")
+    if autothrottle_max_delay < 0:
+        raise ValueError(f"Invalid autothrottle max delay {autothrottle_max_delay}")
+    if autothrottle_target_concurrency < 1:
+        raise ValueError(
+            f"Invalid autothrottle target concurrency {autothrottle_target_concurrency}"
+        )
+    if robots_max_crawl_delay < 0:
+        raise ValueError(f"Invalid robots max crawl delay {robots_max_crawl_delay}")
 
     settings = Settings()
     settings.setmodule("dpk_connector.core.settings", priority="project")
@@ -126,7 +162,7 @@ def async_crawl(
     if user_agent:
         settings.set("USER_AGENT", user_agent, priority="spider")
     if headers:
-        settings.set("DEFAULT_REQUEST_HEADERS", headers)
+        settings.set("DEFAULT_REQUEST_HEADERS", headers, priority="spider")
     if depth_limit == 0:
         depth_limit = -1
     elif depth_limit == -1:
@@ -135,6 +171,25 @@ def async_crawl(
     if download_limit == -1:
         download_limit = 0
     settings.set("CLOSESPIDER_ITEMCOUNT", download_limit, priority="spider")
+    settings.set("CONCURRENT_REQUESTS", concurrent_requests, priority="spider")
+    settings.set(
+        "CONCURRENT_REQUESTS_PER_DOMAIN",
+        concurrent_requests_per_domain,
+        priority="spider",
+    )
+    settings.set("DOWNLOAD_DELAY", download_delay, priority="spider")
+    settings.set(
+        "RANDOMIZE_DOWNLOAD_DELAY", randomize_download_delay, priority="spider"
+    )
+    settings.set("DOWNLOAD_TIMEOUT", download_timeout, priority="spider")
+    settings.set("AUTOTHROTTLE_ENABLED", autothrottle_enabled, priority="spider")
+    settings.set("AUTOTHROTTLE_MAX_DELAY", autothrottle_max_delay, priority="spider")
+    settings.set(
+        "AUTOTHROTTLE_TARGET_CONCURRENCY",
+        autothrottle_target_concurrency,
+        priority="spider",
+    )
+    settings.set("ROBOTS_MAX_CRAWL_DELAY", robots_max_crawl_delay, priority="spider")
 
     runner = MultiThreadedCrawlerRunner(settings)
     runner.crawl(
@@ -169,6 +224,15 @@ def crawl(
     disallow_mime_types: Collection[str] = (),
     depth_limit: int = -1,
     download_limit: int = -1,
+    concurrent_requests: int = 16,
+    concurrent_requests_per_domain: int = 8,
+    download_delay: float = 0,
+    randomize_download_delay: bool = True,
+    download_timeout: float = 180,
+    autothrottle_enabled: bool = True,
+    autothrottle_max_delay: float = 60,
+    autothrottle_target_concurrency: float = 8,
+    robots_max_crawl_delay: float = 60,
 ) -> None:
     # Assisted by WCA@IBM
     # Latest GenAI contribution: ibm/granite-20b-code-instruct-v2
@@ -187,6 +251,15 @@ def crawl(
         disallow_mime_types (Collection[str]): A collection of MIME types to disallow during the crawl. Default is an empty collection.
         depth_limit (int): The maximum depth of the crawl. Default is -1, which means no limit.
         download_limit (int): The maximum number of pages to download. Default is -1, which means no limit. This is a soft limit, meaning that a crawler may download more pages than the specified limit.
+        concurrent_requests (int): The maximum number of concurrent requests to make. Default is 16.
+        concurrent_requests_per_domain (int): The maximum number of concurrent requests to make per domain. Default is 8.
+        download_delay (float): The delay between consecutive requests. Default is 0.
+        randomize_download_delay (bool): If True, the download delay is randomized between 0.5 * `download_delay` and 1.5 * `download_delay`. Default is True.
+        download_timeout (float): The timeout for each request. Default is 180 seconds.
+        autothrottle_enabled (bool): If True, autothrottling is enabled. Default is True.
+        autothrottle_max_delay (float): The maximum delay between consecutive requests when autothrottling is enabled. Default is 60 seconds.
+        autothrottle_target_concurrency (float): The target concurrency for autothrottling. Default is 8.
+        robots_max_crawl_delay (float): The maximum crawl delay allowed by the robots.txt file. Default is 60 seconds.
 
     Returns:
         None
@@ -209,6 +282,15 @@ def on_completed(result: Any):
         disallow_mime_types,
         depth_limit,
         download_limit,
+        concurrent_requests,
+        concurrent_requests_per_domain,
+        download_delay,
+        randomize_download_delay,
+        download_timeout,
+        autothrottle_enabled,
+        autothrottle_max_delay,
+        autothrottle_target_concurrency,
+        robots_max_crawl_delay,
     )
     d.addBoth(on_completed)
     with condition:
diff --git a/data-connector-lib/src/dpk_connector/core/settings.py b/data-connector-lib/src/dpk_connector/core/settings.py
index 041ada253..f6564284a 100644
--- a/data-connector-lib/src/dpk_connector/core/settings.py
+++ b/data-connector-lib/src/dpk_connector/core/settings.py
@@ -16,21 +16,10 @@
 
 # Robots
 ROBOTSTXT_OBEY = True
-ROBOTS_MAX_CRAWL_DELAY = 60
 ROBOTSTXT_PARSER = "dpk_connector.core.middlewares.DelayingProtegoRobotParser"
 
-# Downloader parameters
-CONCURRENT_REQUESTS = 20
-CONCURRENT_REQUESTS_PER_DOMAIN = 10
-DOWNLOAD_DELAY = 0
-RANDOMIZE_DOWNLOAD_DELAY = True
-DOWNLOAD_TIMEOUT = 180
-
 # Autothrottle
-AUTOTHROTTLE_ENABLED = True
 AUTOTHROTTLE_START_DELAY = 0
-AUTOTHROTTLE_MAX_DELAY = 300
-AUTOTHROTTLE_TARGET_CONCURRENCY = 10
 AUTOTHROTTLE_DEBUG = False
 
 # Middlewares/pipelines/extensions
diff --git a/data-connector-lib/test/dpk_connector/core/test_crawler.py b/data-connector-lib/test/dpk_connector/core/test_crawler.py
index 01adecf6e..b4fb3582c 100644
--- a/data-connector-lib/test/dpk_connector/core/test_crawler.py
+++ b/data-connector-lib/test/dpk_connector/core/test_crawler.py
@@ -37,3 +37,33 @@ def on_downloaded(url: str, body: bytes, headers: dict[str, str]):
     with pytest.raises(ValueError) as e:
         crawl(["http://example.com"], on_downloaded, download_limit=-10)
     assert isinstance(e.value, ValueError) is True
+
+    with pytest.raises(ValueError) as e:
+        crawl(["http://example.com"], on_downloaded, concurrent_requests=-10)
+    assert isinstance(e.value, ValueError) is True
+
+    with pytest.raises(ValueError) as e:
+        crawl(["http://example.com"], on_downloaded, concurrent_requests_per_domain=-10)
+    assert isinstance(e.value, ValueError) is True
+
+    with pytest.raises(ValueError) as e:
+        crawl(["http://example.com"], on_downloaded, download_delay=-0.1)
+    assert isinstance(e.value, ValueError) is True
+
+    with pytest.raises(ValueError) as e:
+        crawl(["http://example.com"], on_downloaded, download_timeout=-0.1)
+    assert isinstance(e.value, ValueError) is True
+
+    with pytest.raises(ValueError) as e:
+        crawl(["http://example.com"], on_downloaded, autothrottle_max_delay=-0.1)
+    assert isinstance(e.value, ValueError) is True
+
+    with pytest.raises(ValueError) as e:
+        crawl(
+            ["http://example.com"], on_downloaded, autothrottle_target_concurrency=0.5
+        )
+    assert isinstance(e.value, ValueError) is True
+
+    with pytest.raises(ValueError) as e:
+        crawl(["http://example.com"], on_downloaded, robots_max_crawl_delay=-0.1)
+    assert isinstance(e.value, ValueError) is True
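
For reviewers trying the change out, here is a minimal usage sketch of the tuning knobs PATCH 2/2 exposes. The parameter names and the shape of the crawl() call come from the patch above; the seed URL, the callback body, and the chosen values are placeholders, not part of the change.

# Sketch only: exercises the new crawl() keyword arguments added in PATCH 2/2.
# The URL, the handler body, and the numeric values are illustrative placeholders.
from dpk_connector.core.crawler import crawl


def on_downloaded(url: str, body: bytes, headers: dict[str, str]) -> None:
    # Placeholder handler: just report what was fetched.
    print(f"downloaded {url}: {len(body)} bytes")


# A "polite" configuration: low concurrency, a small randomized delay,
# and autothrottle enabled so the effective delay adapts to server latency.
crawl(
    ["https://example.com"],
    on_downloaded,
    depth_limit=2,
    download_limit=50,
    concurrent_requests=4,
    concurrent_requests_per_domain=2,
    download_delay=0.5,  # randomized to 0.25-0.75 s because randomize_download_delay defaults to True
    download_timeout=60,
    autothrottle_enabled=True,
    autothrottle_target_concurrency=2,
    robots_max_crawl_delay=30,
)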