fix: search

ScrapeGraphAI · Jan 20, 2025 · ce25b6a
1 parent c5fbb8b
commit ce25b6a
Show file tree

Hide file tree

Showing 4 changed files with 42 additions and 34 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -27,10 +27,10 @@ dependencies = [
     "playwright>=1.43.0",
     "undetected-playwright>=0.3.0",
     "semchunk>=2.2.0",
-    "googlesearch-python>=1.2.5",
     "async-timeout>=4.0.3",
     "simpleeval>=1.0.0",
-    "jsonschema>=4.23.0"
+    "jsonschema>=4.23.0",
+    "duckduckgo-search>=7.2.1"
 ]
 
 readme = "README.md"

diff --git a/scrapegraphai/nodes/search_internet_node.py b/scrapegraphai/nodes/search_internet_node.py
@@ -48,7 +48,7 @@ def __init__(
         self.search_engine = (
             node_config["search_engine"]
             if node_config.get("search_engine")
-            else "google"
+            else "duckduckgo"
         )
 
         self.serper_api_key = (

diff --git a/scrapegraphai/utils/research_web.py b/scrapegraphai/utils/research_web.py
@@ -7,13 +7,12 @@
 
 import requests
 from bs4 import BeautifulSoup
-from googlesearch import search as google_search
 from langchain_community.tools import DuckDuckGoSearchResults
 
 
 def search_on_web(
     query: str,
-    search_engine: str = "Google",
+    search_engine: str = "duckduckgo",
     max_results: int = 10,
     port: int = 8080,
     timeout: int = 10,
@@ -41,7 +40,7 @@ def search_on_web(
         raise ValueError("Query must be a non-empty string")
 
     search_engine = search_engine.lower()
-    valid_engines = {"google", "duckduckgo", "bing", "searxng", "serper"}
+    valid_engines = {"duckduckgo", "bing", "searxng", "serper"}
     if search_engine not in valid_engines:
         raise ValueError(f"Search engine must be one of: {', '.join(valid_engines)}")
 
@@ -52,20 +51,12 @@ def search_on_web(
 
     try:
         results = []
-        if search_engine == "google":
-            kwargs = {
-                "num_results": max_results,
-                "proxy": formatted_proxy,
-                "lang": language,
-            }
-            if region:
-                kwargs["region"] = region
-
-            results = list(google_search(query, **kwargs))
-
-        elif search_engine == "duckduckgo":
+        if search_engine == "duckduckgo":
+            # Create a DuckDuckGo search object with max_results
             research = DuckDuckGoSearchResults(max_results=max_results)
+            # Run the search
             res = research.run(query)
+            # Extract URLs using regex
             results = re.findall(r"https?://[^\s,\]]+", res)
 
         elif search_engine == "bing":
@@ -74,7 +65,7 @@ def search_on_web(
         elif search_engine == "searxng":
             results = _search_searxng(query, max_results, port, timeout)
 
-        elif search_engine.lower() == "serper":
+        elif search_engine == "serper":
             results = _search_serper(query, max_results, serper_api_key, timeout)
 
         return filter_pdf_links(results)

diff --git a/uv.lock b/uv.lock