Merge branch 'dev' into 0.2.2.dev2
touma-I committed Oct 31, 2024
2 parents e30d1ac + a725112 commit 0b36f29
Showing 4 changed files with 121 additions and 14 deletions.
86 changes: 84 additions & 2 deletions data-connector-lib/src/dpk_connector/core/crawler.py
@@ -85,6 +85,15 @@ def async_crawl(
disallow_mime_types: Collection[str] = (),
depth_limit: int = -1,
download_limit: int = -1,
concurrent_requests: int = 16,
concurrent_requests_per_domain: int = 8,
download_delay: float = 0,
randomize_download_delay: bool = True,
download_timeout: float = 180,
autothrottle_enabled: bool = True,
autothrottle_max_delay: float = 60,
autothrottle_target_concurrency: float = 8,
robots_max_crawl_delay: float = 60,
) -> Deferred[None]:
# Assisted by WCA@IBM
# Latest GenAI contribution: ibm/granite-20b-code-instruct-v2
@@ -103,12 +112,21 @@ def async_crawl(
disallow_mime_types (Collection[str]): A collection of MIME types to disallow during the crawl. Default is an empty collection.
depth_limit (int): The maximum depth of the crawl. Default is -1, which means no limit.
download_limit (int): The maximum number of pages to download. Default is -1, which means no limit. This is a soft limit, meaning that a crawler may download more pages than the specified limit.
concurrent_requests (int): The maximum number of concurrent requests to make. Default is 16.
concurrent_requests_per_domain (int): The maximum number of concurrent requests to make per domain. Default is 8.
download_delay (float): The delay between consecutive requests. Default is 0.
randomize_download_delay (bool): If True, the download delay is randomized between 0.5 * `download_delay` and 1.5 * `download_delay`. Default is True.
download_timeout (float): The timeout for each request. Default is 180 seconds.
autothrottle_enabled (bool): If True, autothrottling is enabled. Default is True.
autothrottle_max_delay (float): The maximum delay between consecutive requests when autothrottling is enabled. Default is 60 seconds.
autothrottle_target_concurrency (float): The target concurrency for autothrottling. Default is 8.
robots_max_crawl_delay (float): The maximum crawl delay allowed by the robots.txt file. Default is 60 seconds.
Returns:
Deferred[None]: A Twisted deferred object that can be used to wait for the crawler to finish.
"""
if not seed_urls:
raise ValueError(f"Empty seed URLs.")
raise ValueError("Empty seed URLs.")
for url in seed_urls:
if not validate_url(url):
raise ValueError(f"Seed URL {url} is not valid.")
@@ -119,14 +137,32 @@ def async_crawl(
raise ValueError(f"Invalid depth limit {depth_limit}")
if download_limit < -1:
raise ValueError(f"Invalid download limit {download_limit}")
if concurrent_requests < 1:
raise ValueError(f"Invalid concurrent requests {concurrent_requests}")
if concurrent_requests_per_domain < 1:
raise ValueError(
f"Invalid concurrent reuqests per domain {concurrent_requests_per_domain}"
)
if download_delay < 0:
raise ValueError(f"Invalid download delay {download_delay}")
if download_timeout < 0:
raise ValueError(f"Invalid donwload timeout {download_timeout}")
if autothrottle_max_delay < 0:
raise ValueError(f"Invalid autothrottle max delay {autothrottle_max_delay}")
if autothrottle_target_concurrency < 1:
raise ValueError(
f"Invalid autothrottle target concurrency {autothrottle_target_concurrency}"
)
if robots_max_crawl_delay < 0:
raise ValueError(f"Invalid robots max crawl delay {robots_max_crawl_delay}")

settings = Settings()
settings.setmodule("dpk_connector.core.settings", priority="project")

if user_agent:
settings.set("USER_AGENT", user_agent, priority="spider")
if headers:
settings.set("DEFAULT_REQUEST_HEADERS", headers)
settings.set("DEFAULT_REQUEST_HEADERS", headers, priority="spider")
if depth_limit == 0:
depth_limit = -1
elif depth_limit == -1:
@@ -135,6 +171,25 @@
if download_limit == -1:
download_limit = 0
settings.set("CLOSESPIDER_ITEMCOUNT", download_limit, priority="spider")
settings.set("CONCURRENT_REQUESTS", concurrent_requests, priority="spider")
settings.set(
"CONCURRENT_REQUESTS_PER_DOMAIN",
concurrent_requests_per_domain,
priority="spider",
)
settings.set("DOWNLOAD_DELAY", download_delay, priority="spider")
settings.set(
"RANDOMIZE_DOWNLOAD_DELAY", randomize_download_delay, priority="spider"
)
settings.set("DOWNLOAD_TIMEOUT", download_timeout, priority="spider")
settings.set("AUTOTHROTTLE_ENABLED", autothrottle_enabled, priority="spider")
settings.set("AUTOTHROTTLE_MAX_DELAY", autothrottle_max_delay, priority="spider")
settings.set(
"AUTOTHROTTLE_TARGET_CONCURRENCY",
autothrottle_target_concurrency,
priority="spider",
)
settings.set("ROBOTS_MAX_CRAWL_DELAY", robots_max_crawl_delay, priority="spider")

runner = MultiThreadedCrawlerRunner(settings)
runner.crawl(
@@ -169,6 +224,15 @@ def crawl(
disallow_mime_types: Collection[str] = (),
depth_limit: int = -1,
download_limit: int = -1,
concurrent_requests: int = 16,
concurrent_requests_per_domain: int = 8,
download_delay: float = 0,
randomize_download_delay: bool = True,
download_timeout: float = 180,
autothrottle_enabled: bool = True,
autothrottle_max_delay: float = 60,
autothrottle_target_concurrency: float = 8,
robots_max_crawl_delay: float = 60,
) -> None:
# Assisted by WCA@IBM
# Latest GenAI contribution: ibm/granite-20b-code-instruct-v2
@@ -187,6 +251,15 @@ def crawl(
disallow_mime_types (Collection[str]): A collection of MIME types to disallow during the crawl. Default is an empty collection.
depth_limit (int): The maximum depth of the crawl. Default is -1, which means no limit.
download_limit (int): The maximum number of pages to download. Default is -1, which means no limit. This is a soft limit, meaning that a crawler may download more pages than the specified limit.
concurrent_requests (int): The maximum number of concurrent requests to make. Default is 16.
concurrent_requests_per_domain (int): The maximum number of concurrent requests to make per domain. Default is 8.
download_delay (float): The delay between consecutive requests. Default is 0.
randomize_download_delay (bool): If True, the download delay is randomized between 0.5 * `download_delay` and 1.5 * `download_delay`. Default is True.
download_timeout (float): The timeout for each request. Default is 180 seconds.
autothrottle_enabled (bool): If True, autothrottling is enabled. Default is True.
autothrottle_max_delay (float): The maximum delay between consecutive requests when autothrottling is enabled. Default is 60 seconds.
autothrottle_target_concurrency (float): The target concurrency for autothrottling. Default is 8.
robots_max_crawl_delay (float): The maximum crawl delay allowed by the robots.txt file. Default is 60 seconds.
Returns:
None
@@ -209,6 +282,15 @@ def on_completed(result: Any):
disallow_mime_types,
depth_limit,
download_limit,
concurrent_requests,
concurrent_requests_per_domain,
download_delay,
randomize_download_delay,
download_timeout,
autothrottle_enabled,
autothrottle_max_delay,
autothrottle_target_concurrency,
robots_max_crawl_delay,
)
d.addBoth(on_completed)
with condition:
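To see the new knobs end to end, here is a minimal usage sketch (not part of this commit); the import path and the on_downloaded signature mirror the test file below, and all parameter values are illustrative:

# Hedged usage sketch: assumes dpk_connector is installed and that crawl()
# blocks until the crawl finishes, as the synchronous wrapper above suggests.
from dpk_connector.core.crawler import crawl

def on_downloaded(url: str, body: bytes, headers: dict[str, str]) -> None:
    # Invoked once per downloaded page.
    print(f"Downloaded {url} ({len(body)} bytes)")

crawl(
    ["https://example.com"],
    on_downloaded,
    depth_limit=2,                     # follow links at most two hops deep
    download_limit=100,                # soft cap on downloaded pages
    concurrent_requests=16,            # global cap (the default)
    concurrent_requests_per_domain=8,  # per-domain cap (the default)
    download_delay=0.5,                # randomized to 0.25-0.75 s per request
    download_timeout=30,
    autothrottle_target_concurrency=4,
    robots_max_crawl_delay=60,
)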
11 changes: 0 additions & 11 deletions data-connector-lib/src/dpk_connector/core/settings.py
@@ -16,21 +16,10 @@

# Robots
ROBOTSTXT_OBEY = True
ROBOTS_MAX_CRAWL_DELAY = 60
ROBOTSTXT_PARSER = "dpk_connector.core.middlewares.DelayingProtegoRobotParser"

# Downloader parameters
CONCURRENT_REQUESTS = 20
CONCURRENT_REQUESTS_PER_DOMAIN = 10
DOWNLOAD_DELAY = 0
RANDOMIZE_DOWNLOAD_DELAY = True
DOWNLOAD_TIMEOUT = 180

# Autothrottle
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 0
AUTOTHROTTLE_MAX_DELAY = 300
AUTOTHROTTLE_TARGET_CONCURRENCY = 10
AUTOTHROTTLE_DEBUG = False

# Middlewares/pipelines/extensions
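These module-level defaults could be removed because async_crawl now sets the same keys per crawl at "spider" priority, which outranks the "project" priority this settings module is loaded at. A short sketch of the mechanics (standard Scrapy behavior, not code from this commit):

# "spider" priority (30) beats "project" priority (20) in Scrapy's settings
# model, so the per-crawl values win over the module defaults.
from scrapy.settings import Settings

settings = Settings()
settings.setmodule("dpk_connector.core.settings", priority="project")
settings.set("CONCURRENT_REQUESTS", 16, priority="spider")
print(settings.getint("CONCURRENT_REQUESTS"))  # -> 16, the spider-level value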
30 changes: 30 additions & 0 deletions data-connector-lib/test/dpk_connector/core/test_crawler.py
@@ -37,3 +37,33 @@ def on_downloaded(url: str, body: bytes, headers: dict[str, str]):
with pytest.raises(ValueError) as e:
crawl(["http://example.com"], on_downloaded, download_limit=-10)
assert isinstance(e.value, ValueError) is True

with pytest.raises(ValueError) as e:
crawl(["http://example.com"], on_downloaded, concurrent_requests=-10)
assert isinstance(e.value, ValueError) is True

with pytest.raises(ValueError) as e:
crawl(["http://example.com"], on_downloaded, concurrent_requests_per_domain=-10)
assert isinstance(e.value, ValueError) is True

with pytest.raises(ValueError) as e:
crawl(["http://example.com"], on_downloaded, download_delay=-0.1)
assert isinstance(e.value, ValueError) is True

with pytest.raises(ValueError) as e:
crawl(["http://example.com"], on_downloaded, download_timeout=-0.1)
assert isinstance(e.value, ValueError) is True

with pytest.raises(ValueError) as e:
crawl(["http://example.com"], on_downloaded, autothrottle_max_delay=-0.1)
assert isinstance(e.value, ValueError) is True

with pytest.raises(ValueError) as e:
crawl(
["http://example.com"], on_downloaded, autothrottle_target_concurrency=0.5
)
assert isinstance(e.value, ValueError) is True

with pytest.raises(ValueError) as e:
crawl(["http://example.com"], on_downloaded, robots_max_crawl_delay=-0.1)
assert isinstance(e.value, ValueError) is True
8 changes: 7 additions & 1 deletion resources.md
@@ -15,9 +15,15 @@

## Example Code

## Tutorials / Blogs
## Blogs / Tutorials

- [**IBM Developer Blog**](https://developer.ibm.com/blogs/awb-unleash-potential-llms-data-prep-kit/)

## Workshops

- **2024-09-21: "RAG with Data Prep Kit" Workshop** @ Mountain View, CA, USA - [info](https://github.com/sujee/data-prep-kit-examples/blob/main/events/2024-09-21__RAG-workshop-data-riders.md)

## Discord

- [**Data Prep Kit Discord Channel**](https://discord.com/channels/1276554812359442504/1286046139921207476)
