From 5ffe0df8417e070e0655f4b4d3244e0e9bc09ad8 Mon Sep 17 00:00:00 2001
From: Yuhong Sun
Date: Sat, 23 Nov 2024 14:31:08 -0800
Subject: [PATCH] Remove weak LLM prompt option and unused query validation
 endpoints

---
 backend/danswer/configs/chat_configs.py      |  5 ---
 .../llm/answering/prompts/quotes_prompt.py   | 38 +------------------
 backend/danswer/one_shot_answer/models.py    |  4 --
 backend/danswer/prompts/direct_qa_prompts.py | 12 ------
 .../secondary_llm_flows/query_validation.py  |  6 +--
 .../danswer/server/query_and_chat/models.py  |  5 +--
 .../server/query_and_chat/query_backend.py   | 31 ---------------
 7 files changed, 5 insertions(+), 96 deletions(-)

diff --git a/backend/danswer/configs/chat_configs.py b/backend/danswer/configs/chat_configs.py
index a72baacf686..2d72bed0f5a 100644
--- a/backend/danswer/configs/chat_configs.py
+++ b/backend/danswer/configs/chat_configs.py
@@ -17,9 +17,6 @@
 # ~3k input, half for docs, half for chat history + prompts
 CHAT_TARGET_CHUNK_PERCENTAGE = 512 * 3 / 3072
 
-# For selecting a different LLM question-answering prompt format
-# Valid values: default, cot, weak
-QA_PROMPT_OVERRIDE = os.environ.get("QA_PROMPT_OVERRIDE") or None
 # 1 / (1 + DOC_TIME_DECAY * doc-age-in-years), set to 0 to have no decay
 # Capped in Vespa at 0.5
 DOC_TIME_DECAY = float(
@@ -27,8 +24,6 @@
 )
 BASE_RECENCY_DECAY = 0.5
 FAVOR_RECENT_DECAY_MULTIPLIER = 2.0
-# Currently this next one is not configurable via env
-DISABLE_LLM_QUERY_ANSWERABILITY = QA_PROMPT_OVERRIDE == "weak"
 # For the highest matching base size chunk, how many chunks above and below do we pull in by default
 # Note this is not in any of the deployment configs yet
 # Currently only applies to search flow not chat
diff --git a/backend/danswer/llm/answering/prompts/quotes_prompt.py b/backend/danswer/llm/answering/prompts/quotes_prompt.py
index 3cdaaefcfdb..00f22f9e7df 100644
--- a/backend/danswer/llm/answering/prompts/quotes_prompt.py
+++ b/backend/danswer/llm/answering/prompts/quotes_prompt.py
@@ -2,7 +2,6 @@
 
 from danswer.chat.models import LlmDoc
 from danswer.configs.chat_configs import LANGUAGE_HINT
-from danswer.configs.chat_configs import QA_PROMPT_OVERRIDE
 from danswer.context.search.models import InferenceChunk
 from danswer.db.search_settings import get_multilingual_expansion
 from danswer.llm.answering.models import PromptConfig
@@ -10,39 +9,10 @@
 from danswer.prompts.direct_qa_prompts import CONTEXT_BLOCK
 from danswer.prompts.direct_qa_prompts import HISTORY_BLOCK
 from danswer.prompts.direct_qa_prompts import JSON_PROMPT
-from danswer.prompts.direct_qa_prompts import WEAK_LLM_PROMPT
 from danswer.prompts.prompt_utils import add_date_time_to_prompt
 from danswer.prompts.prompt_utils import build_complete_context_str
 
 
-def _build_weak_llm_quotes_prompt(
-    question: str,
-    context_docs: list[LlmDoc] | list[InferenceChunk],
-    history_str: str,
-    prompt: PromptConfig,
-) -> HumanMessage:
-    """Since Danswer supports a variety of LLMs, this less demanding prompt is provided
-    as an option to use with weaker LLMs such as small version, low float precision, quantized,
-    or distilled models. It only uses one context document and has very weak requirements of
-    output format.
-    """
-    context_block = ""
-    if context_docs:
-        context_block = CONTEXT_BLOCK.format(context_docs_str=context_docs[0].content)
-
-    prompt_str = WEAK_LLM_PROMPT.format(
-        system_prompt=prompt.system_prompt,
-        context_block=context_block,
-        task_prompt=prompt.task_prompt,
-        user_query=question,
-    )
-
-    if prompt.datetime_aware:
-        prompt_str = add_date_time_to_prompt(prompt_str=prompt_str)
-
-    return HumanMessage(content=prompt_str)
-
-
 def _build_strong_llm_quotes_prompt(
     question: str,
     context_docs: list[LlmDoc] | list[InferenceChunk],
@@ -81,15 +51,9 @@ def build_quotes_user_message(
     history_str: str,
     prompt: PromptConfig,
 ) -> HumanMessage:
-    prompt_builder = (
-        _build_weak_llm_quotes_prompt
-        if QA_PROMPT_OVERRIDE == "weak"
-        else _build_strong_llm_quotes_prompt
-    )
-
     query, _ = message_to_prompt_and_imgs(message)
 
-    return prompt_builder(
+    return _build_strong_llm_quotes_prompt(
         question=query,
         context_docs=context_docs,
         history_str=history_str,
diff --git a/backend/danswer/one_shot_answer/models.py b/backend/danswer/one_shot_answer/models.py
index 21463867d28..630c7b5cab4 100644
--- a/backend/danswer/one_shot_answer/models.py
+++ b/backend/danswer/one_shot_answer/models.py
@@ -36,10 +36,6 @@ class PromptConfig(BaseModel):
     datetime_aware: bool = True
 
 
-class DocumentSetConfig(BaseModel):
-    id: int
-
-
 class ToolConfig(BaseModel):
     id: int
 
diff --git a/backend/danswer/prompts/direct_qa_prompts.py b/backend/danswer/prompts/direct_qa_prompts.py
index b1229b896a7..b00cfcebf15 100644
--- a/backend/danswer/prompts/direct_qa_prompts.py
+++ b/backend/danswer/prompts/direct_qa_prompts.py
@@ -118,18 +118,6 @@
 """
 
 
-# For weak LLM which only takes one chunk and cannot output json
-# Also not requiring quotes as it tends to not work
-WEAK_LLM_PROMPT = f"""
-{{system_prompt}}
-{{context_block}}
-{{task_prompt}}
-
-{QUESTION_PAT.upper()}
-{{user_query}}
-""".strip()
-
-
 # This is only for visualization for the users to specify their own prompts
 # The actual flow does not work like this
 PARAMATERIZED_PROMPT = f"""
diff --git a/backend/danswer/secondary_llm_flows/query_validation.py b/backend/danswer/secondary_llm_flows/query_validation.py
index 2ee428f0090..d11e603715e 100644
--- a/backend/danswer/secondary_llm_flows/query_validation.py
+++ b/backend/danswer/secondary_llm_flows/query_validation.py
@@ -1,9 +1,9 @@
+# NOTE No longer used. This needs to be revisited later.
 import re
 from collections.abc import Iterator
 
 from danswer.chat.models import DanswerAnswerPiece
 from danswer.chat.models import StreamingError
-from danswer.configs.chat_configs import DISABLE_LLM_QUERY_ANSWERABILITY
 from danswer.llm.exceptions import GenAIDisabledException
 from danswer.llm.factory import get_default_llms
 from danswer.llm.utils import dict_based_prompt_to_langchain_prompt
@@ -46,7 +46,7 @@ def extract_answerability_bool(model_raw: str) -> bool:
 
 
 def get_query_answerability(
-    user_query: str, skip_check: bool = DISABLE_LLM_QUERY_ANSWERABILITY
+    user_query: str, skip_check: bool = False
 ) -> tuple[str, bool]:
     if skip_check:
         return "Query Answerability Evaluation feature is turned off", True
@@ -67,7 +67,7 @@
 
 
 def stream_query_answerability(
-    user_query: str, skip_check: bool = DISABLE_LLM_QUERY_ANSWERABILITY
+    user_query: str, skip_check: bool = False
 ) -> Iterator[str]:
     if skip_check:
         yield get_json_line(
diff --git a/backend/danswer/server/query_and_chat/models.py b/backend/danswer/server/query_and_chat/models.py
index c316435996e..ae6e651fff1 100644
--- a/backend/danswer/server/query_and_chat/models.py
+++ b/backend/danswer/server/query_and_chat/models.py
@@ -29,10 +29,6 @@ class TagResponse(BaseModel):
     tags: list[SourceTag]
 
 
-class SimpleQueryRequest(BaseModel):
-    query: str
-
-
 class UpdateChatSessionThreadRequest(BaseModel):
     # If not specified, use Danswer default persona
     chat_session_id: UUID
@@ -217,6 +213,7 @@ class ChatSessionDetailResponse(BaseModel):
     current_alternate_model: str | None
 
 
+# NOTE No longer used.
 class QueryValidationResponse(BaseModel):
     reasoning: str
     answerable: bool
diff --git a/backend/danswer/server/query_and_chat/query_backend.py b/backend/danswer/server/query_and_chat/query_backend.py
index 6fb848dfa38..f07d98f0aa9 100644
--- a/backend/danswer/server/query_and_chat/query_backend.py
+++ b/backend/danswer/server/query_and_chat/query_backend.py
@@ -34,15 +34,11 @@
 from danswer.document_index.vespa.index import VespaIndex
 from danswer.one_shot_answer.answer_question import stream_search_answer
 from danswer.one_shot_answer.models import DirectQARequest
-from danswer.secondary_llm_flows.query_validation import get_query_answerability
-from danswer.secondary_llm_flows.query_validation import stream_query_answerability
 from danswer.server.query_and_chat.models import AdminSearchRequest
 from danswer.server.query_and_chat.models import AdminSearchResponse
 from danswer.server.query_and_chat.models import ChatSessionDetails
 from danswer.server.query_and_chat.models import ChatSessionsResponse
-from danswer.server.query_and_chat.models import QueryValidationResponse
 from danswer.server.query_and_chat.models import SearchSessionDetailResponse
-from danswer.server.query_and_chat.models import SimpleQueryRequest
 from danswer.server.query_and_chat.models import SourceTag
 from danswer.server.query_and_chat.models import TagResponse
 from danswer.server.query_and_chat.token_limit import check_token_rate_limits
@@ -135,18 +131,6 @@ def get_tags(
     return TagResponse(tags=server_tags)
 
 
-@basic_router.post("/query-validation")
-def query_validation(
-    simple_query: SimpleQueryRequest, _: User = Depends(current_user)
-) -> QueryValidationResponse:
-    # Note if weak model prompt is chosen, this check does not occur and will simply return that
-    # the query is valid, this is because weaker models cannot really handle this task well.
-    # Additionally, some weak model servers cannot handle concurrent inferences.
-    logger.notice(f"Validating query: {simple_query.query}")
-    reasoning, answerable = get_query_answerability(simple_query.query)
-    return QueryValidationResponse(reasoning=reasoning, answerable=answerable)
-
-
 @basic_router.get("/user-searches")
 def get_user_search_sessions(
     user: User | None = Depends(current_user),
@@ -247,21 +231,6 @@ def get_search_session(
     return response
 
 
-# NOTE No longer used, after search/chat redesign.
-# No search responses are answered with a conversational generative AI response
-@basic_router.post("/stream-query-validation")
-def stream_query_validation(
-    simple_query: SimpleQueryRequest, _: User = Depends(current_user)
-) -> StreamingResponse:
-    # Note if weak model prompt is chosen, this check does not occur and will simply return that
-    # the query is valid, this is because weaker models cannot really handle this task well.
-    # Additionally, some weak model servers cannot handle concurrent inferences.
-    logger.notice(f"Validating query: {simple_query.query}")
-    return StreamingResponse(
-        stream_query_answerability(simple_query.query), media_type="application/json"
-    )
-
-
 @basic_router.post("/stream-answer-with-quote")
 def get_answer_with_quote(
     query_request: DirectQARequest,