This repository has been archived by the owner on Jan 5, 2025. It is now read-only.

Merge pull request #655 from openchatai/feat/neural_search
Feat/neural search
codebanesr authored Feb 24, 2024
2 parents d683835 + a11d727 commit 7e6386a
Showing 5 changed files with 77 additions and 26 deletions.
31 changes: 31 additions & 0 deletions llm-server/routes/search/search_controller.py
@@ -3,6 +3,8 @@
 from utils.get_logger import CustomLogger
 from utils.llm_consts import VectorCollections, initialize_qdrant_client
 from qdrant_client import models  # Add this line
+from routes.search.search_service import weighted_search
+from pydantic import BaseModel
 
 search_workflow = Blueprint("search", __name__)

@@ -43,3 +45,32 @@ def search_vector_store(chatbot_id: str):
     results = get_all_results(chatbot_id, keyword)
 
     return jsonify(results), 201
+
+
+class WeightedSearchRequest(BaseModel):
+    query: str
+    title_weight: float = 0.7
+    description_weight: float = 0.3
+
+
+@search_workflow.route("/cmd_bar/<chatbot_id>", methods=["POST"])
+def get_cmdbar_data(chatbot_id: str):
+    try:
+        request_data = WeightedSearchRequest(
+            **request.get_json()
+        )  # Assuming you have a class to parse data
+        scored_points = weighted_search(
+            chatbot_id,
+            request_data.query,
+            request_data.title_weight,
+            request_data.description_weight,
+        )
+        return (
+            jsonify([sp.model_dump() for sp in scored_points]),
+            200,
+        )
+
+    except ValueError as e:  # Example of handling a potential error
+        return jsonify({"error": str(e)}), 400  # Bad request
+    except Exception as e:
+        return jsonify({"error": "Internal server error"}), 500
28 changes: 25 additions & 3 deletions llm-server/routes/search/search_service.py
@@ -5,6 +5,7 @@
 from typing import Dict, List, Optional
 import operator
 from copy import deepcopy
+from utils.llm_consts import ENABLE_NEURAL_SEARCH
 
 client = initialize_qdrant_client()
 embedding = get_embeddings()
@@ -64,23 +65,44 @@ def add_cmdbar_data(items: List[Item], metadata: Dict[str, str]) -> None:
 
 # Function to search with weights
 def weighted_search(
-    query: str, title_weight: float = 0.7, description_weight: float = 0.3
+    chatbot_id: str,
+    query: str,
+    title_weight: float = 0.7,
+    description_weight: float = 0.3,
 ) -> List[models.ScoredPoint]:
     query_embedding = embedding.embed_query(query)
 
     # Search title and descriptions
     title_results = client.search(
         collection_name=VectorCollections.neural_search,
         query_vector=models.NamedVector(name="title", vector=query_embedding),
+        query_filter=models.Filter(
+            must=[
+                models.FieldCondition(
+                    key="metadata.bot_id",
+                    match=models.MatchValue(value=str(chatbot_id)),
+                )
+            ]
+        ),
         limit=20,
         with_payload=True,
-        with_vector=False,
+        with_vectors=False,
     )
 
     description_results = client.search(
         collection_name=VectorCollections.neural_search,
         query_vector=models.NamedVector(name="description", vector=query_embedding),
+        query_filter=models.Filter(
+            must=[
+                models.FieldCondition(
+                    key="metadata.bot_id",
+                    match=models.MatchValue(value=chatbot_id),
+                )
+            ]
+        ),
         limit=20,
         with_payload=True,
-        with_vector=False,
+        with_vectors=False,
     )
 
     # Build a lookup for description results
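The merge step that follows the two searches ("Build a lookup for description results") is truncated in this view. A minimal sketch of how a weighted combination of title and description hits might look; this is an assumption for illustration, not the committed implementation:

import operator
from copy import deepcopy
from typing import Dict, List

from qdrant_client import models


def combine_scores(
    title_results: List[models.ScoredPoint],
    description_results: List[models.ScoredPoint],
    title_weight: float,
    description_weight: float,
) -> List[models.ScoredPoint]:
    # Index description hits by point id so title hits can pick up their score.
    description_by_id: Dict[str, models.ScoredPoint] = {
        str(sp.id): sp for sp in description_results
    }
    merged: Dict[str, models.ScoredPoint] = {}
    for sp in title_results:
        combined = deepcopy(sp)
        desc = description_by_id.get(str(sp.id))
        combined.score = title_weight * sp.score + (
            description_weight * desc.score if desc else 0.0
        )
        merged[str(sp.id)] = combined
    # Points that only matched on description keep their weighted description score.
    for sp in description_results:
        if str(sp.id) not in merged:
            combined = deepcopy(sp)
            combined.score = description_weight * sp.score
            merged[str(sp.id)] = combined
    return sorted(merged.values(), key=operator.attrgetter("score"), reverse=True)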
2 changes: 2 additions & 0 deletions llm-server/utils/llm_consts.py
@@ -121,3 +121,5 @@ def get_mysql_uri():
 )
 
 JWT_SECRET_KEY = os.getenv("JWT_SECRET_KEY", "YOURSUPERSECRETKEY")
+
+ENABLE_NEURAL_SEARCH = os.getenv("ENABLE_NEURAL_SEARCH", "NO") == "YES"
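The flag is truthy only when the environment variable is set to exactly YES. The hunk above imports it into search_service.py, but the call site is outside the visible diff; a hypothetical sketch of how such a flag might gate command-bar indexing:

# Assumed usage, not shown in this diff: skip neural-search indexing unless the
# operator has opted in with ENABLE_NEURAL_SEARCH=YES in the server environment.
from utils.llm_consts import ENABLE_NEURAL_SEARCH
from routes.search.search_service import add_cmdbar_data


def index_for_cmdbar(items, metadata) -> None:
    if not ENABLE_NEURAL_SEARCH:
        return  # feature flag off: do nothing
    add_cmdbar_data(items, metadata)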
32 changes: 15 additions & 17 deletions llm-server/workers/tasks/url_parsers.py
@@ -70,24 +70,22 @@ def get_url_fragments(self, content) -> List[LinkInformation]:
     def find_all_headings_and_highlights(
         self, content: str
     ) -> Tuple[str, List[Tuple[str, str]]]:
-        soup = BeautifulSoup(content, "lxml")
-        title_tag = soup.title
-        title = ""
-        if title_tag is not None:
-            title = title_tag.get_text(strip=True)
-
-        headings: List[Tuple[str, str]] = []
-
-        for heading in soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"]):
-            heading_text = heading.get_text(strip=True)
-
-            # Check if the heading or one of its children has an 'id' attribute
-            id_tag = heading.find(attrs={"id": True})
-            if id_tag:
-                heading_id = id_tag["id"]
-                headings.append((heading_text, heading_id))
-
-        return title, headings
+        soup = BeautifulSoup(content, "lxml")
+        title = soup.title.text if soup.title else ""
+        elements_with_id = soup.find_all(id=True)
+        links = soup.find_all("a")
+        pairs = []
+        for element in elements_with_id:
+            id_ = element.get("id")
+            if id_:  # A simple check if the id exists
+                corresponding_links = [
+                    link for link in links if link.get("href") == "#" + id_
+                ]  # Removed "./#" prefix
+                if corresponding_links:
+                    for link in corresponding_links:
+                        pairs.append((element.get_text(strip=True), id_))
+        return title, pairs
 
     def parse_text_content(self, content) -> str:
         text = BeautifulSoup(content, "lxml").get_text()
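The rewritten find_all_headings_and_highlights no longer walks heading tags; it pairs any element carrying an id with in-page anchors that point at it. A small illustration of the expected return value (here parser stands for an instance of the parser class this method belongs to; the class name is outside the visible hunk):

html = """
<html>
  <head><title>Docs</title></head>
  <body>
    <h2 id="install">Installation</h2>
    <a href="#install">Jump to installation</a>
    <h2 id="usage">Usage</h2>
  </body>
</html>
"""

title, pairs = parser.find_all_headings_and_highlights(html)
print(title)  # "Docs"
print(pairs)  # [("Installation", "install")]; "usage" is skipped, no <a href="#usage">

Note that an element referenced by several anchors is appended once per matching anchor, so duplicate pairs are possible.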
10 changes: 4 additions & 6 deletions llm-server/workers/tasks/web_crawl.py
@@ -73,7 +73,8 @@ def scrape_url(url: str, bot_id: str):
             for heading_text, heading_id in headings
         ]
 
-        add_cmdbar_data(items, {"url": url, "bot_id": bot_id})
+        if len(items) > 0:
+            add_cmdbar_data(items, {"url": url, "bot_id": bot_id})
         return parser.parse_text_content(content)
     except ValueError as e:
         # Log an error message if no parser is available for the content type
@@ -140,11 +141,8 @@ def scrape_website(url: str, bot_id: str, max_pages: int) -> int:
                 chatbot_id=bot_id, url=current_url, status="SUCCESS"
             )
 
-            # Get links on the current page
-            links = get_links(current_url)
-
-            # Add new links to the queue
-            queue.extend(links)
+            links = get_links(current_url)
+            queue.extend(links)
 
         except Exception as e:
             logger.error("WEB_SCRAPE_ERROR", error=e)
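For orientation, the changed lines sit inside a standard breadth-first crawl loop; a simplified sketch, assuming scrape_url and get_links behave as in this module (status bookkeeping and error handling omitted):

from collections import deque


def crawl(start_url: str, bot_id: str, max_pages: int) -> int:
    # Simplified sketch of the scrape_website loop; scrape_url and get_links
    # refer to the functions defined earlier in web_crawl.py.
    queue = deque([start_url])
    visited = set()
    scraped = 0
    while queue and scraped < max_pages:
        current_url = queue.popleft()
        if current_url in visited:
            continue
        visited.add(current_url)
        scrape_url(current_url, bot_id)        # parse the page, index cmd-bar items
        scraped += 1
        queue.extend(get_links(current_url))   # enqueue newly discovered links
    return scraped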
