Skip to content

Commit

Permalink
fix: search
Browse files Browse the repository at this point in the history
  • Loading branch information
VinciGit00 committed Jan 20, 2025
1 parent c5fbb8b commit ce25b6a
Show file tree
Hide file tree
Showing 4 changed files with 42 additions and 34 deletions.
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,10 @@ dependencies = [
  "playwright>=1.43.0",
  "undetected-playwright>=0.3.0",
  "semchunk>=2.2.0",
- "googlesearch-python>=1.2.5",
  "async-timeout>=4.0.3",
  "simpleeval>=1.0.0",
- "jsonschema>=4.23.0"
+ "jsonschema>=4.23.0",
+ "duckduckgo-search>=7.2.1"
  ]

readme = "README.md"
Expand Down
2 changes: 1 addition & 1 deletion scrapegraphai/nodes/search_internet_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def __init__(
  self.search_engine = (
  node_config["search_engine"]
  if node_config.get("search_engine")
- else "google"
+ else "duckduckgo"
  )

self.serper_api_key = (
Expand Down
23 changes: 7 additions & 16 deletions scrapegraphai/utils/research_web.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,12 @@

  import requests
  from bs4 import BeautifulSoup
- from googlesearch import search as google_search
  from langchain_community.tools import DuckDuckGoSearchResults


  def search_on_web(
  query: str,
- search_engine: str = "Google",
+ search_engine: str = "duckduckgo",
  max_results: int = 10,
  port: int = 8080,
  timeout: int = 10,
Expand Down Expand Up @@ -41,7 +40,7 @@ def search_on_web(
raise ValueError("Query must be a non-empty string")

  search_engine = search_engine.lower()
- valid_engines = {"google", "duckduckgo", "bing", "searxng", "serper"}
+ valid_engines = {"duckduckgo", "bing", "searxng", "serper"}
  if search_engine not in valid_engines:
  raise ValueError(f"Search engine must be one of: {', '.join(valid_engines)}")

Expand All @@ -52,20 +51,12 @@ def search_on_web(

  try:
  results = []
- if search_engine == "google":
- kwargs = {
- "num_results": max_results,
- "proxy": formatted_proxy,
- "lang": language,
- }
- if region:
- kwargs["region"] = region
-
- results = list(google_search(query, **kwargs))
-
- elif search_engine == "duckduckgo":
+ if search_engine == "duckduckgo":
+ # Create a DuckDuckGo search object with max_results
  research = DuckDuckGoSearchResults(max_results=max_results)
+ # Run the search
  res = research.run(query)
+ # Extract URLs using regex
  results = re.findall(r"https?://[^\s,\]]+", res)

  elif search_engine == "bing":
Expand All @@ -74,7 +65,7 @@ def search_on_web(
  elif search_engine == "searxng":
  results = _search_searxng(query, max_results, port, timeout)

- elif search_engine.lower() == "serper":
+ elif search_engine == "serper":
  results = _search_serper(query, max_results, serper_api_key, timeout)

  return filter_pdf_links(results)
Expand Down
47 changes: 32 additions & 15 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit ce25b6a

Please sign in to comment.