diff --git a/llm-server/routes/search/search_controller.py b/llm-server/routes/search/search_controller.py
index 48e7339bc..8ca81a660 100644
--- a/llm-server/routes/search/search_controller.py
+++ b/llm-server/routes/search/search_controller.py
@@ -3,6 +3,8 @@
 from utils.get_logger import CustomLogger
 from utils.llm_consts import VectorCollections, initialize_qdrant_client
 from qdrant_client import models  # Add this line
+from routes.search.search_service import weighted_search
+from pydantic import BaseModel
 
 search_workflow = Blueprint("search", __name__)
 
@@ -43,3 +45,32 @@ def search_vector_store(chatbot_id: str):
     results = get_all_results(chatbot_id, keyword)
 
     return jsonify(results), 201
+
+
+class WeightedSearchRequest(BaseModel):
+    query: str
+    title_weight: float = 0.7
+    description_weight: float = 0.3
+
+
+@search_workflow.route("/cmd_bar/", methods=["POST"])
+def get_cmdbar_data(chatbot_id: str):
+    try:
+        request_data = WeightedSearchRequest(
+            **request.get_json()
+        )  # Assuming you have a class to parse data
+        scored_points = weighted_search(
+            chatbot_id,
+            request_data.query,
+            request_data.title_weight,
+            request_data.description_weight,
+        )
+        return (
+            jsonify([sp.model_dump() for sp in scored_points]),
+            200,
+        )
+
+    except ValueError as e:  # Example of handling a potential error
+        return jsonify({"error": str(e)}), 400  # Bad request
+    except Exception as e:
+        return jsonify({"error": "Internal server error"}), 500
diff --git a/llm-server/routes/search/search_service.py b/llm-server/routes/search/search_service.py
index 877130636..16b50e651 100644
--- a/llm-server/routes/search/search_service.py
+++ b/llm-server/routes/search/search_service.py
@@ -5,6 +5,7 @@
 from typing import Dict, List, Optional
 import operator
 from copy import deepcopy
+from utils.llm_consts import ENABLE_NEURAL_SEARCH
 
 client = initialize_qdrant_client()
 embedding = get_embeddings()
@@ -64,7 +65,10 @@ def add_cmdbar_data(items: List[Item], metadata: Dict[str, str]) -> None:
 
 # Function to search with weights
 def weighted_search(
-    query: str, title_weight: float = 0.7, description_weight: float = 0.3
+    chatbot_id: str,
+    query: str,
+    title_weight: float = 0.7,
+    description_weight: float = 0.3,
 ) -> List[models.ScoredPoint]:
     query_embedding = embedding.embed_query(query)
 
@@ -72,15 +76,33 @@
     title_results = client.search(
         collection_name=VectorCollections.neural_search,
         query_vector=models.NamedVector(name="title", vector=query_embedding),
+        query_filter=models.Filter(
+            must=[
+                models.FieldCondition(
+                    key="metadata.bot_id",
+                    match=models.MatchValue(value=str(chatbot_id)),
+                )
+            ]
+        ),
+        limit=20,
         with_payload=True,
-        with_vector=False,
+        with_vectors=False,
     )
 
     description_results = client.search(
         collection_name=VectorCollections.neural_search,
         query_vector=models.NamedVector(name="description", vector=query_embedding),
+        query_filter=models.Filter(
+            must=[
+                models.FieldCondition(
+                    key="metadata.bot_id",
+                    match=models.MatchValue(value=chatbot_id),
+                )
+            ]
+        ),
+        limit=20,
         with_payload=True,
-        with_vector=False,
+        with_vectors=False,
     )
 
     # Build a lookup for description results
diff --git a/llm-server/utils/llm_consts.py b/llm-server/utils/llm_consts.py
index 23ba0e4b8..cde4a0ab1 100644
--- a/llm-server/utils/llm_consts.py
+++ b/llm-server/utils/llm_consts.py
@@ -121,3 +121,5 @@ def get_mysql_uri():
 )
 
 JWT_SECRET_KEY = os.getenv("JWT_SECRET_KEY", "YOURSUPERSECRETKEY")
+
+ENABLE_NEURAL_SEARCH = os.getenv("ENABLE_NEURAL_SEARCH", "NO") == "YES"
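The hunk above cuts off `weighted_search` right before the merge step (the last context line is `# Build a lookup for description results`). As a rough sketch of what a weighted merge of the two result sets can look like — not the code in this PR — the helper below combines title and description scores per point id. `combine_scores`, the stand-in `ScoredPoint` class, and the fallback score of 0.0 are illustrative assumptions:

```python
# Hypothetical sketch only -- not part of this diff.
from typing import Dict, List


class ScoredPoint:
    """Minimal stand-in for qdrant_client's ScoredPoint model (assumption)."""

    def __init__(self, id: str, score: float, payload: dict):
        self.id = id
        self.score = score
        self.payload = payload


def combine_scores(
    title_results: List[ScoredPoint],
    description_results: List[ScoredPoint],
    title_weight: float = 0.7,
    description_weight: float = 0.3,
) -> List[ScoredPoint]:
    # Look up description scores by point id; points that only matched on the
    # title vector fall back to a description score of 0.0.
    description_score: Dict[str, float] = {p.id: p.score for p in description_results}
    combined = [
        ScoredPoint(
            p.id,
            title_weight * p.score
            + description_weight * description_score.get(p.id, 0.0),
            p.payload,
        )
        for p in title_results
    ]
    # Highest combined score first.
    return sorted(combined, key=lambda p: p.score, reverse=True)
```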
diff --git a/llm-server/workers/tasks/url_parsers.py b/llm-server/workers/tasks/url_parsers.py
index ce8d3f4f0..50231e07a 100644
--- a/llm-server/workers/tasks/url_parsers.py
+++ b/llm-server/workers/tasks/url_parsers.py
@@ -70,24 +70,22 @@ def get_url_fragments(self, content) -> List[LinkInformation]:
 
     def find_all_headings_and_highlights(
         self, content: str
     ) -> Tuple[str, List[Tuple[str, str]]]:
-        soup = BeautifulSoup(content, "lxml")
-        title_tag = soup.title
-        title = ""
-        if title_tag is not None:
-            title = title_tag.get_text(strip=True)
-
-        headings: List[Tuple[str, str]] = []
-
-        for heading in soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"]):
-            heading_text = heading.get_text(strip=True)
-            # Check if the heading or one of its children has an 'id' attribute
-            id_tag = heading.find(attrs={"id": True})
-            if id_tag:
-                heading_id = id_tag["id"]
-                headings.append((heading_text, heading_id))
-
-        return title, headings
+        soup = BeautifulSoup(content, "lxml")
+        title = soup.title.text if soup.title else ""
+        elements_with_id = soup.find_all(id=True)
+        links = soup.find_all("a")
+        pairs = []
+        for element in elements_with_id:
+            id_ = element.get("id")
+            if id_:  # A simple check if the id exists
+                corresponding_links = [
+                    link for link in links if link.get("href") == "#" + id_
+                ]  # Removed "./#" prefix
+                if corresponding_links:
+                    for link in corresponding_links:
+                        pairs.append((element.get_text(strip=True), id_))
+        return title, pairs
 
     def parse_text_content(self, content) -> str:
         text = BeautifulSoup(content, "lxml").get_text()
diff --git a/llm-server/workers/tasks/web_crawl.py b/llm-server/workers/tasks/web_crawl.py
index 37379a539..207b28225 100644
--- a/llm-server/workers/tasks/web_crawl.py
+++ b/llm-server/workers/tasks/web_crawl.py
@@ -73,7 +73,8 @@ def scrape_url(url: str, bot_id: str):
             for heading_text, heading_id in headings
         ]
 
-        add_cmdbar_data(items, {"url": url, "bot_id": bot_id})
+        if len(items) > 0:
+            add_cmdbar_data(items, {"url": url, "bot_id": bot_id})
         return parser.parse_text_content(content)
     except ValueError as e:
         # Log an error message if no parser is available for the content type
@@ -140,11 +141,8 @@ def scrape_website(url: str, bot_id: str, max_pages: int) -> int:
                     chatbot_id=bot_id, url=current_url, status="SUCCESS"
                 )
 
-                # Get links on the current page
-                links = get_links(current_url)
-
-                # Add new links to the queue
-                queue.extend(links)
+                links = get_links(current_url)
+                queue.extend(links)
 
             except Exception as e:
                 logger.error("WEB_SCRAPE_ERROR", error=e)
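For context on the rewritten `find_all_headings_and_highlights`, the snippet below (an illustration, not part of the diff; the sample HTML is made up) shows the kind of `(text, id)` pairs the new version returns: only elements that carry an `id` and are the target of an in-page `#<id>` anchor are kept, which is why the guard in `scrape_url` now skips `add_cmdbar_data` when no such pairs are found.

```python
# Illustration of the new pairing logic -- not part of this diff.
from bs4 import BeautifulSoup

html = """
<html>
  <head><title>Docs</title></head>
  <body>
    <nav><a href="#install">Install</a> <a href="#usage">Usage</a></nav>
    <h2 id="install">Installation</h2>
    <h2 id="usage">Usage</h2>
    <h2 id="faq">FAQ</h2>
  </body>
</html>
"""

soup = BeautifulSoup(html, "lxml")
title = soup.title.text if soup.title else ""
links = soup.find_all("a")
pairs = []
for element in soup.find_all(id=True):
    id_ = element.get("id")
    # Keep only elements whose id is referenced by an in-page anchor;
    # the "faq" heading has no matching link, so it is skipped.
    if id_ and any(link.get("href") == "#" + id_ for link in links):
        pairs.append((element.get_text(strip=True), id_))

print(title)  # Docs
print(pairs)  # [('Installation', 'install'), ('Usage', 'usage')]
```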