From 34151bd8c9c97f6461f8c5b540fd7ed30e8b2cd5 Mon Sep 17 00:00:00 2001 From: mengliu1998 <604629@gmail.com> Date: Mon, 10 Jun 2024 08:58:22 -0700 Subject: [PATCH 01/20] Add QA use cases --- docs/source/index.rst | 4 +- use_cases/question_answering/chatbot.ipynb | 194 +++++++++++++++++++ use_cases/question_answering/simple_qa.ipynb | 170 ++++++++++++++++ 3 files changed, 366 insertions(+), 2 deletions(-) create mode 100644 use_cases/question_answering/chatbot.ipynb create mode 100644 use_cases/question_answering/simple_qa.ipynb diff --git a/docs/source/index.rst b/docs/source/index.rst index bd575994..731e1612 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -45,8 +45,8 @@ You have a similar coding experience as PyTorch. Here is a side to side comparis .. code-block:: python - from core.component import Component, Generator - from components.model_client import OpenAIClient + from lightrag.core.component import Component, Generator + from lightrag.components.model_client import OpenAIClient class SimpleQA(Component): def __init__(self): diff --git a/use_cases/question_answering/chatbot.ipynb b/use_cases/question_answering/chatbot.ipynb new file mode 100644 index 00000000..6df298c0 --- /dev/null +++ b/use_cases/question_answering/chatbot.ipynb @@ -0,0 +1,194 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Build a ChatBot " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We have built a simple question-answering pipeline, where we can input a question and get an answer. In addition to a single round of question-answering, we can also have a conversation with an LLM by building a chatbot. The chatbot can remember the history of the conversation and respond based on the history. The key to achieve this is to leverage the prompt args `chat_history_str` and the data structure `Memory` to manage the conversation history."
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# Import needed module from LightRAG\n", + "from lightrag.core.component import Component\n", + "from lightrag.core.generator import Generator\n", + "from lightrag.core.memory import Memory" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "# Here, we use the OpenAIClient as an example, but you can use any other clients (with the corresponding API Key as needed), such as AnthropicAPIClient\n", + "from lightrag.components.model_client import OpenAIClient\n", + "OPENAI_API_KEY=\"YOUR_API_KEY\" # Replace with your OpenAI API Key, or you can put it in a .env file" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ChatBot(\n", + " (generator): Generator(\n", + " model_kwargs={'model': 'gpt-3.5-turbo'}, model_type=ModelType.LLM\n", + " (system_prompt): Prompt(\n", + " template: {# task desc #}\n", + " {% if task_desc_str %}\n", + " {{task_desc_str}}\n", + " {% else %}\n", + " Answer user query.\n", + " {% endif %}\n", + " {# output format #}\n", + " {% if output_format_str %}\n", + " \n", + " {{output_format_str}}\n", + " \n", + " {% endif %}\n", + " {# tools #}\n", + " {% if tools_str %}\n", + " \n", + " {{tools_str}}\n", + " \n", + " {% endif %}\n", + " {# example #}\n", + " {% if examples_str %}\n", + " \n", + " {{examples_str}}\n", + " \n", + " {% endif %}\n", + " {# chat history #}\n", + " {% if chat_history_str %}\n", + " \n", + " {{chat_history_str}}\n", + " \n", + " {% endif %}\n", + " {#contex#}\n", + " {% if context_str %}\n", + " \n", + " {{context_str}}\n", + " \n", + " {% endif %}\n", + " {# steps #}\n", + " {% if steps_str %}\n", + " \n", + " {{steps_str}}\n", + " \n", + " {% endif %}\n", + " {% if input_str %}\n", + " \n", + " {{input_str}}\n", + " \n", + " {% endif %}\n", + 
" {% if output_str %}\n", + " \n", + " {{output_str}}\n", + " \n", + " {% endif %}\n", + " , prompt_variables: ['context_str', 'task_desc_str', 'tools_str', 'chat_history_str', 'input_str', 'output_str', 'output_format_str', 'steps_str', 'examples_str']\n", + " )\n", + " (model_client): OpenAIClient()\n", + " )\n", + " (chat_history): Memory()\n", + ")\n" + ] + } + ], + "source": [ + "# Build the ChatBot pipeline\n", + "class ChatBot(Component):\n", + " def __init__(self):\n", + " super().__init__()\n", + " self.generator = Generator(\n", + " model_client=OpenAIClient(),\n", + " model_kwargs={'model': 'gpt-3.5-turbo'}\n", + " )\n", + " self.chat_history = Memory() # Memory to store the chat history\n", + "\n", + " def call(self, query):\n", + " return self.generator.call({'input_str': query})\n", + " \n", + " def call(self) -> str:\n", + " print(\"Welcome to the ChatBot. Type anything to chat. Type 'exit' to end.\")\n", + " while True:\n", + " user_input = input(\"You: \")\n", + " if user_input.lower() == \"exit\":\n", + " print(\"Goodbye!\")\n", + " break\n", + " chat_history_str = self.chat_history()\n", + " response = self.generator(\n", + " prompt_kwargs={\n", + " \"input_str\": user_input,\n", + " \"chat_history_str\": chat_history_str,\n", + " },\n", + " )\n", + " # save the user input and response to the memory\n", + " self.chat_history.add_dialog_turn(\n", + " user_query=user_input, assistant_response=response\n", + " )\n", + " print(f\"ChatBot: {response}\")\n", + "\n", + "chatbot = ChatBot()\n", + "print(chatbot)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Welcome to the ChatBot. Type anything to chat. Type 'exit' to end.\n", + "ChatBot: GeneratorOutput(data=\"Learning to drive can be an exciting and rewarding experience. Here are some general steps to help you get started on your journey to becoming a safe and confident driver:\\n\\n1. 
Get a learner's permit: In most places, you will need to obtain a learner's permit before you can start learning how to drive. Check with your local department of motor vehicles for the specific requirements in your area.\\n\\n2. Take a driver's education course: Consider enrolling in a driver's education course to learn the rules of the road and get some hands-on practice with a qualified instructor.\\n\\n3. Practice with a licensed driver: Before you can get your driver's license, you will need to log a certain number of supervised driving hours with a licensed adult. This is a great opportunity to get comfortable behind the wheel and practice your skills.\\n\\n4. Study the driver's manual: Make sure to familiarize yourself with the driver's manual for your state or country. It contains important information about traffic laws, road signs, and safe driving practices.\\n\\n5. Practice, practice, practice: The more time you spend behind the wheel, the more confident you will become as a driver. Practice in a variety of different conditions and situations to hone your skills.\\n\\n6. Take a driving test: Once you feel ready, schedule a driving test with your local department of motor vehicles. If you pass the test, you will receive your driver's license and be able to drive independently.\\n\\nRemember, learning to drive takes time and practice, so be patient with yourself and don't be afraid to ask for help if you need it. Good luck on your journey to becoming a licensed driver!\", error=None, raw_response=\"Learning to drive can be an exciting and rewarding experience. Here are some general steps to help you get started on your journey to becoming a safe and confident driver:\\n\\n1. Get a learner's permit: In most places, you will need to obtain a learner's permit before you can start learning how to drive. Check with your local department of motor vehicles for the specific requirements in your area.\\n\\n2. 
Take a driver's education course: Consider enrolling in a driver's education course to learn the rules of the road and get some hands-on practice with a qualified instructor.\\n\\n3. Practice with a licensed driver: Before you can get your driver's license, you will need to log a certain number of supervised driving hours with a licensed adult. This is a great opportunity to get comfortable behind the wheel and practice your skills.\\n\\n4. Study the driver's manual: Make sure to familiarize yourself with the driver's manual for your state or country. It contains important information about traffic laws, road signs, and safe driving practices.\\n\\n5. Practice, practice, practice: The more time you spend behind the wheel, the more confident you will become as a driver. Practice in a variety of different conditions and situations to hone your skills.\\n\\n6. Take a driving test: Once you feel ready, schedule a driving test with your local department of motor vehicles. If you pass the test, you will receive your driver's license and be able to drive independently.\\n\\nRemember, learning to drive takes time and practice, so be patient with yourself and don't be afraid to ask for help if you need it. Good luck on your journey to becoming a licensed driver!\")\n", + "ChatBot: GeneratorOutput(data=\"To get a driver's license in California, you can follow these general steps:\\n\\n1. Obtain a learner's permit: Applicants must be at least 15 and a half years old to apply for a learner's permit in California. You will need to pass a written knowledge test and a vision test to obtain your permit.\\n\\n2. Complete driver's education: If you are under 17 and a half years old, you must complete a driver's education course before applying for a provisional permit.\\n\\n3. Practice driving: With your learner's permit, you can start practicing driving with a licensed adult who is at least 25 years old.\\n\\n4. 
Apply for a provisional license: After holding your learner's permit for at least 6 months and completing at least 50 hours of practice (including 10 hours at night), you can apply for a provisional license.\\n\\n5. Pass the driving test: Schedule and pass a driving test at a local DMV office. Make sure to bring all required documents and fees.\\n\\n6. Receive your driver's license: If you pass the driving test, you will receive your provisional driver's license. With this license, you will have certain restrictions, such as driving with no passengers under 20 years old for the first year.\\n\\nRemember to check with the California Department of Motor Vehicles (DMV) for the most up-to-date and specific requirements for obtaining a driver's license in the state. Good luck with your journey to becoming a licensed driver in California!\", error=None, raw_response=\"To get a driver's license in California, you can follow these general steps:\\n\\n1. Obtain a learner's permit: Applicants must be at least 15 and a half years old to apply for a learner's permit in California. You will need to pass a written knowledge test and a vision test to obtain your permit.\\n\\n2. Complete driver's education: If you are under 17 and a half years old, you must complete a driver's education course before applying for a provisional permit.\\n\\n3. Practice driving: With your learner's permit, you can start practicing driving with a licensed adult who is at least 25 years old.\\n\\n4. Apply for a provisional license: After holding your learner's permit for at least 6 months and completing at least 50 hours of practice (including 10 hours at night), you can apply for a provisional license.\\n\\n5. Pass the driving test: Schedule and pass a driving test at a local DMV office. Make sure to bring all required documents and fees.\\n\\n6. Receive your driver's license: If you pass the driving test, you will receive your provisional driver's license. 
With this license, you will have certain restrictions, such as driving with no passengers under 20 years old for the first year.\\n\\nRemember to check with the California Department of Motor Vehicles (DMV) for the most up-to-date and specific requirements for obtaining a driver's license in the state. Good luck with your journey to becoming a licensed driver in California!\")\n", + "Goodbye!\n" + ] + } + ], + "source": [ + "chatbot.call()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "lightrag-project", + "language": "python", + "name": "light-rag-project" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/use_cases/question_answering/simple_qa.ipynb b/use_cases/question_answering/simple_qa.ipynb new file mode 100644 index 00000000..0f9c721e --- /dev/null +++ b/use_cases/question_answering/simple_qa.ipynb @@ -0,0 +1,170 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Build A Simple Question-Answering Pipeline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this use case, we show how to build a simple question-answering pipeline." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# Import needed module from LightRAG\n", + "from lightrag.core.component import Component\n", + "from lightrag.core.generator import Generator" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "# Here, we use the OpenAIClient as an example, but you can use any other clients (with the corresponding API Key as needed), such as AnthropicAPIClient\n", + "from lightrag.components.model_client import OpenAIClient\n", + "OPENAI_API_KEY=\"YOUR_API_KEY\" # Replace with your OpenAI API Key, or you can put it in a .env file" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SimpleQA(\n", + " (generator): Generator(\n", + " model_kwargs={'model': 'gpt-3.5-turbo'}, model_type=ModelType.LLM\n", + " (system_prompt): Prompt(\n", + " template: {# task desc #}\n", + " {% if task_desc_str %}\n", + " {{task_desc_str}}\n", + " {% else %}\n", + " Answer user query.\n", + " {% endif %}\n", + " {# output format #}\n", + " {% if output_format_str %}\n", + " \n", + " {{output_format_str}}\n", + " \n", + " {% endif %}\n", + " {# tools #}\n", + " {% if tools_str %}\n", + " \n", + " {{tools_str}}\n", + " \n", + " {% endif %}\n", + " {# example #}\n", + " {% if examples_str %}\n", + " \n", + " {{examples_str}}\n", + " \n", + " {% endif %}\n", + " {# chat history #}\n", + " {% if chat_history_str %}\n", + " \n", + " {{chat_history_str}}\n", + " \n", + " {% endif %}\n", + " {#contex#}\n", + " {% if context_str %}\n", + " \n", + " {{context_str}}\n", + " \n", + " {% endif %}\n", + " {# steps #}\n", + " {% if steps_str %}\n", + " \n", + " {{steps_str}}\n", + " \n", + " {% endif %}\n", + " {% if input_str %}\n", + " \n", + " {{input_str}}\n", + " \n", + " {% endif %}\n", + " {% if output_str %}\n", + " \n", + " 
{{output_str}}\n", + " \n", + " {% endif %}\n", + " , prompt_variables: ['tools_str', 'examples_str', 'output_format_str', 'chat_history_str', 'context_str', 'task_desc_str', 'steps_str', 'input_str', 'output_str']\n", + " )\n", + " (model_client): OpenAIClient()\n", + " )\n", + ")\n" + ] + } + ], + "source": [ + "# Build the SimpleQA pipeline\n", + "class SimpleQA(Component):\n", + " def __init__(self):\n", + " super().__init__()\n", + " self.generator = Generator(\n", + " model_client=OpenAIClient(),\n", + " model_kwargs={'model': 'gpt-3.5-turbo'}\n", + " )\n", + "\n", + " def call(self, query):\n", + " return self.generator.call(prompt_kwargs={'input_str': query})\n", + "\n", + "simple_qa = SimpleQA()\n", + "print(simple_qa)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "GeneratorOutput(data='The capital of France is Paris.', error=None, raw_response='The capital of France is Paris.')\n" + ] + } + ], + "source": [ + "query = \"What is the capital of France?\"\n", + "response = simple_qa.call(query)\n", + "print(response)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "lightrag-project", + "language": "python", + "name": "light-rag-project" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From d6aa7285e1d1504560e02edf120d0aec3d3b77ab Mon Sep 17 00:00:00 2001 From: mengliu1998 <604629@gmail.com> Date: Tue, 11 Jun 2024 21:34:28 -0700 Subject: [PATCH 02/20] clean up --- lightrag/eval/llm_as_judge.py | 28 ++++++++++++++-------------- lightrag/tests/test_evaluators.py | 2 +- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/lightrag/eval/llm_as_judge.py 
b/lightrag/eval/llm_as_judge.py index 55646f66..676e1113 100644 --- a/lightrag/eval/llm_as_judge.py +++ b/lightrag/eval/llm_as_judge.py @@ -42,8 +42,8 @@ class DefaultLLMJudge(Component): __doc__ = r"""Demonstrate how to use an LLM/Generator to output True or False for a judgement query. - You can use any any of your template to adapt to more tasks and sometimes you can directly ask LLM to output a score in range [0, 1] instead of only True or False. - + You can use any of your template to adapt to more tasks and sometimes you can directly ask LLM to output a score in range [0, 1] instead of only True or False. + A call on the LLM judge equalize to _compute_single_item method. Args: @@ -59,7 +59,7 @@ def __init__( super().__init__() self.model_client = model_client if model_client is None: - log.info(f"model_client is None, default to OpenAIClient.") + log.info("model_client is None, default to OpenAIClient.") try: from lightrag.components.model_client import OpenAIClient except ImportError: @@ -82,8 +82,8 @@ def call( Args: question (str): Question string. - pred_answer (str): Predicted answer string. gt_answer (str): Ground truth answer string. + pred_answer (str): Predicted answer string. judgement_query (str): Judgement query string. Returns: @@ -126,7 +126,7 @@ class LLMasJudge: >>> judgement_query = "For the question, does the predicted answer contain the ground truth answer?" >>> llm_judge = LLMasJudge() >>> avg_judgement, judgement_list = llm_judge.compute( - questions, pred_answers, gt_answers, judgement_query + questions, gt_answers, pred_answers, judgement_query ) >>> avg_judgement 2 / 3 @@ -143,8 +143,8 @@ def __init__( def compute( self, questions: List[str], - pred_answers: List[str], gt_answers: List[str], + pred_answers: List[str], judgement_query: str, ) -> List[bool]: r""" @@ -152,19 +152,21 @@ def compute( Args: questions (List[str]): List of question strings. - pred_answers (List[str]): List of predicted answer strings. 
gt_answers (List[str]): List of ground truth answer strings. + pred_answers (List[str]): List of predicted answer strings. judgement_query (str): Judgement query string. Returns: - List[bool]: Judgement results. + tuple: + - float: Average judgement score. + - List[bool]: Judgement results for each query. """ judgement_list = [] - for question, pred_answer, gt_answer in zip( - questions, pred_answers, gt_answers + for question, gt_answer, pred_answer in zip( + questions, gt_answers, pred_answers ): judgement = self.llm_evaluator( - question, pred_answer, gt_answer, judgement_query + question, gt_answer, pred_answer, judgement_query ) judgement_list.append(judgement) @@ -172,8 +174,6 @@ def compute( if __name__ == "__main__": - from lightrag.utils import setup_env - from lightrag.components.model_client import OpenAIClient questions = [ "Is Beijing in China?", @@ -187,7 +187,7 @@ def compute( ) llm_judge = LLMasJudge() avg_judgement, judgement_list = llm_judge.compute( - questions, pred_answers, gt_answers, judgement_query + questions, gt_answers, pred_answers, judgement_query ) print(avg_judgement) print(judgement_list) diff --git a/lightrag/tests/test_evaluators.py b/lightrag/tests/test_evaluators.py index 694a534c..71b460b1 100644 --- a/lightrag/tests/test_evaluators.py +++ b/lightrag/tests/test_evaluators.py @@ -79,7 +79,7 @@ def test_llm_as_judge(): ) llm_judge = LLMasJudge() avg_judgement, judgement_list = llm_judge.compute( - questions, pred_answers, gt_answers, judgement_query + questions, gt_answers, pred_answers, judgement_query ) assert avg_judgement == 2 / 3 assert judgement_list == [True, True, False] From 50c9f7aad247db41c9d5e7cfda572994c3d44106 Mon Sep 17 00:00:00 2001 From: mengliu1998 <604629@gmail.com> Date: Tue, 11 Jun 2024 22:07:05 -0700 Subject: [PATCH 03/20] refine the eval guide --- docs/source/developer_notes/evaluation.rst | 46 +++++++++++++--------- 1 file changed, 27 insertions(+), 19 deletions(-) diff --git 
a/docs/source/developer_notes/evaluation.rst b/docs/source/developer_notes/evaluation.rst index a5f563ea..e2b86290 100644 --- a/docs/source/developer_notes/evaluation.rst +++ b/docs/source/developer_notes/evaluation.rst @@ -68,31 +68,39 @@ If you are interested in computing metrics such as accuracy, F1-score, ROUGE, BE If you are particulay interested in evaluating RAG (Retrieval-Augmented Generation) pipelines, we have several metrics available in LightRAG to assess both the quality of the retrieved context and the quality of the final generated answer. -- :class:`RetrieverEvaluator `: This evaluator is used to evaluate the performance of the retriever component of the RAG pipeline. It has metric functions to compute the recall and context relevance of the retriever. -- :class:`AnswerMacthEvaluator `: This evaluator is used to evaluate the performance of the generator component of the RAG pipeline. It has metric functions to compute the exact match and fuzzy match accuracy of the generated answer. -- :class:`LLMasJudge `: This evaluator uses an LLM to get the judgement of the predicted answer for a list of questions. The task description and the judgement query of the LLM judge can be customized. It has a metric function to compute the judgement score, which is the number of generated answers that are judged as correct by the LLM divided by the total number of generated answers. +- :class:`RetrieverRecall `: This is used to evaluate the recall of the retriever component of the RAG pipeline. +- :class:`RetrieverRelevance `: This is used to evaluate the relevance of the retrieved context to the query. +- :class:`AnswerMatchAcc `: This calculates the exact match accuracy or fuzzy match accuracy of the generated answers by comparing them to the ground truth answers. +- :class:`LLMasJudge `: This uses an LLM to get the judgement of the generated answer for a list of questions. The task description and the judgement query of the LLM judge can be customized. 
It computes the judgement score, which is the number of generated answers that are judged as correct by the LLM divided by the total number of generated answers. For example, you can use the following code snippet to compute the recall and relevance of the retriever component of the RAG pipeline for a single query. .. code-block:: python :linenos: - from eval.evaluators import RetrieverEvaluator - retrieved_context = "Apple is founded before Google." # Retrieved context - gt_context = ["Apple is founded in 1976.", - "Google is founded in 1998.", - "Apple is founded before Google."] # Ground truth context - retriever_evaluator = RetrieverEvaluator() # Initialize the RetrieverEvaluator - recall = retriever_evaluator.compute_recall_single_query( - retrieved_context, gt_context - ) # Compute the recall of the retriever - relevance = retriever_evaluator.compute_context_relevance_single_query( - retrieved_context, gt_context - ) # Compute the relevance of the retriever - print(f"Recall: {recall}, Relevance: {relevance}") - # Recall: 0.3333333333333333, Relevance: 1.0 - -For a more detailed instructions on how to use these evaluators to evaluate RAG pipelines, you can refer to the tutorial on :doc:`Evaluating a RAG Pipeline <../tutorials/eval_a_rag>`, where we provide a step-by-step guide on how to use these evaluators to evaluate a RAG pipeline on HotpotQA dataset. + from lightrag.eval import RetrieverRecall, RetrieverRelevance + retrieved_contexts = [ + "Apple is founded before Google.", + "Feburary has 28 days in common years. Feburary has 29 days in leap years. 
Feburary is the second month of the year.", + ] + gt_contexts = [ + [ + "Apple is founded in 1976.", + "Google is founded in 1998.", + "Apple is founded before Google.", + ], + ["Feburary has 28 days in common years", "Feburary has 29 days in leap years"], + ] + retriever_recall = RetrieverRecall() + avg_recall, recall_list = retriever_recall.compute(retrieved_contexts, gt_contexts) # Compute the recall of the retriever + print(f"Recall: {avg_recall}, Recall List: {recall_list}") + # Recall: 0.6666666666666666, Recall List: [0.3333333333333333, 1.0] + retriever_relevance = RetrieverRelevance() + avg_relevance, relevance_list = retriever_relevance.compute(retrieved_contexts, gt_contexts) # Compute the relevance of the retriever + print(f"Relevance: {avg_relevance}, Relevance List: {relevance_list}") + # Relevance: 0.803030303030303, Relevance List: [1.0, 0.6060606060606061] + +For more detailed instructions on how to build and evaluate RAG pipelines, you can refer to the use case on :doc:`Evaluating a RAG Pipeline <../tutorials/eval_a_rag>`. If you intent to use metrics that are not available in the LightRAG library, you can also implement your own custom metric functions or use other libraries such as `RAGAS `_ to compute the desired metrics for evaluating RAG pipelines.
From 354bcf271b8ad5ed5a2d66aef782912b5f3b1623 Mon Sep 17 00:00:00 2001 From: mengliu1998 <604629@gmail.com> Date: Sat, 15 Jun 2024 17:46:56 -0700 Subject: [PATCH 04/20] Added the RAG notebook (kernel crashed) --- use_cases/question_answering/chatbot.ipynb | 8 +- use_cases/question_answering/simple_qa.ipynb | 32 +- .../simple_rag.ipynb | 348 ++++++++++++++++++ .../simple_rag.yaml | 19 + 4 files changed, 389 insertions(+), 18 deletions(-) create mode 100644 use_cases/retrieval_augmented_generation/simple_rag.ipynb create mode 100644 use_cases/retrieval_augmented_generation/simple_rag.yaml diff --git a/use_cases/question_answering/chatbot.ipynb b/use_cases/question_answering/chatbot.ipynb index 6df298c0..20f2c94f 100644 --- a/use_cases/question_answering/chatbot.ipynb +++ b/use_cases/question_answering/chatbot.ipynb @@ -20,7 +20,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Import needed module from LightRAG\n", + "# Import needed modules from LightRAG\n", "from lightrag.core.component import Component\n", "from lightrag.core.generator import Generator\n", "from lightrag.core.memory import Memory" @@ -121,9 +121,6 @@ " model_kwargs={'model': 'gpt-3.5-turbo'}\n", " )\n", " self.chat_history = Memory() # Memory to store the chat history\n", - "\n", - " def call(self, query):\n", - " return self.generator.call({'input_str': query})\n", " \n", " def call(self) -> str:\n", " print(\"Welcome to the ChatBot. Type anything to chat.
Type 'exit' to end.\")\n", @@ -133,13 +130,14 @@ " print(\"Goodbye!\")\n", " break\n", " chat_history_str = self.chat_history()\n", + " # Generate the response from the user input and chat history\n", " response = self.generator(\n", " prompt_kwargs={\n", " \"input_str\": user_input,\n", " \"chat_history_str\": chat_history_str,\n", " },\n", " )\n", - " # save the user input and response to the memory\n", + " # Save the user input and response to the memory\n", " self.chat_history.add_dialog_turn(\n", " user_query=user_input, assistant_response=response\n", " )\n", diff --git a/use_cases/question_answering/simple_qa.ipynb b/use_cases/question_answering/simple_qa.ipynb index 0f9c721e..15e649ac 100644 --- a/use_cases/question_answering/simple_qa.ipynb +++ b/use_cases/question_answering/simple_qa.ipynb @@ -16,18 +16,18 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ - "# Import needed module from LightRAG\n", + "# Import needed modules from LightRAG\n", "from lightrag.core.component import Component\n", "from lightrag.core.generator import Generator" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -38,7 +38,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -47,13 +47,15 @@ "text": [ "SimpleQA(\n", " (generator): Generator(\n", - " model_kwargs={'model': 'gpt-3.5-turbo'}, model_type=ModelType.LLM\n", - " (system_prompt): Prompt(\n", - " template: {# task desc #}\n", + " model_kwargs={'model': 'gpt-3.5-turbo'}, \n", + " (prompt): Prompt(\n", + " template: \n", + " {% if task_desc_str or output_format_str or tools_str or examples_str or chat_history_str or context_str or steps_str %}\n", + " \n", + " {% endif %}\n", + " {# task desc #}\n", " {% if task_desc_str %}\n", " {{task_desc_str}}\n", - " {% else %}\n", - " Answer user query.\n", " {% endif %}\n", " {# 
output format #}\n", " {% if output_format_str %}\n", @@ -91,6 +93,9 @@ " {{steps_str}}\n", " \n", " {% endif %}\n", + " {% if task_desc_str or output_format_str or tools_str or examples_str or chat_history_str or context_str or steps_str %}\n", + " \n", + " {% endif %}\n", " {% if input_str %}\n", " \n", " {{input_str}}\n", @@ -101,7 +106,8 @@ " {{output_str}}\n", " \n", " {% endif %}\n", - " , prompt_variables: ['tools_str', 'examples_str', 'output_format_str', 'chat_history_str', 'context_str', 'task_desc_str', 'steps_str', 'input_str', 'output_str']\n", + " You:\n", + " , prompt_variables: ['task_desc_str', 'steps_str', 'output_str', 'chat_history_str', 'tools_str', 'output_format_str', 'examples_str', 'context_str', 'input_str']\n", " )\n", " (model_client): OpenAIClient()\n", " )\n", @@ -119,7 +125,7 @@ " model_kwargs={'model': 'gpt-3.5-turbo'}\n", " )\n", "\n", - " def call(self, query):\n", + " def call(self, query: str):\n", " return self.generator.call(prompt_kwargs={'input_str': query})\n", "\n", "simple_qa = SimpleQA()\n", @@ -128,14 +134,14 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "GeneratorOutput(data='The capital of France is Paris.', error=None, raw_response='The capital of France is Paris.')\n" + "GeneratorOutput(data='The capital of France is Paris.', error=None, usage=None, raw_response='The capital of France is Paris.')\n" ] } ], diff --git a/use_cases/retrieval_augmented_generation/simple_rag.ipynb b/use_cases/retrieval_augmented_generation/simple_rag.ipynb new file mode 100644 index 00000000..b1912af4 --- /dev/null +++ b/use_cases/retrieval_augmented_generation/simple_rag.ipynb @@ -0,0 +1,348 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Build a Simple Retrieval-Augmented Generation (RAG) Pipeline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this 
use case, we show how to build and evaluate a simple RAG pipeline with LightRAG. RAG (Retrieval-Augmented Generation) pipelines leverage a retriever to fetch relevant context from a knowledge base (e.g., a document database) which is then fed to an LLM generator with the query to produce the answer. This allows the model to generate more contextually relevant answers." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# Import needed modules, including modules for loading datasets, constructing a RAG pipeline, and evaluating the performance of the RAG pipeline.\n", + "import yaml\n", + "from typing import Any, List, Optional, Union\n", + "\n", + "from datasets import load_dataset\n", + "\n", + "from lightrag.core.types import Document\n", + "from lightrag.core.component import Component, Sequential\n", + "from lightrag.core.embedder import Embedder\n", + "from lightrag.core.document_splitter import DocumentSplitter\n", + "from lightrag.core.data_components import (\n", + " RetrieverOutputToContextStr,\n", + " ToEmbeddings,\n", + ")\n", + "from lightrag.components.retriever import FAISSRetriever\n", + "from lightrag.core.generator import Generator\n", + "from lightrag.core.db import LocalDocumentDB\n", + "from lightrag.core.string_parser import JsonParser" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Here, we use the OpenAIClient in the Generator as an example, but you can use any other clients (with the corresponding API Key as needed)\n", + "from lightrag.components.model_client import OpenAIClient\n", + "OPENAI_API_KEY=\"YOUR_API_KEY\" # Replace with your OpenAI API Key, or you can put it in a .env file" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Define the configuration for the RAG pipeline**. We load the configuration from a YAML file. 
This configuration specifies the components of the RAG pipeline, including the text_splitter, vectorizer, retriever, and generator."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'vectorizer': {'batch_size': 100, 'model_kwargs': {'model': 'text-embedding-3-small', 'dimensions': 256, 'encoding_format': 'float'}}, 'retriever': {'top_k': 2}, 'generator': {'model': 'gpt-3.5-turbo', 'temperature': 0.3, 'stream': False}, 'text_splitter': {'split_by': 'sentence', 'chunk_size': 1, 'chunk_overlap': 0}}\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Define the configuration settings for the RAG pipeline.\n",
+    "with open(\"./simple_rag.yaml\", \"r\") as file:\n",
+    "    settings = yaml.safe_load(file)\n",
+    "print(settings)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Load a dataset**. Here, we use the [HotpotQA](https://huggingface.co/datasets/hotpot_qa) dataset as an example. Each data sample in HotpotQA has *question*, *answer*, *context* and *supporting_facts* selected from the whole context."
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "example: {'id': '5a7a06935542990198eaf050', 'question': \"Which magazine was started first Arthur's Magazine or First for Women?\", 'answer': \"Arthur's Magazine\", 'type': 'comparison', 'level': 'medium', 'supporting_facts': {'title': [\"Arthur's Magazine\", 'First for Women'], 'sent_id': [0, 0]}, 'context': {'title': ['Radio City (Indian radio station)', 'History of Albanian football', 'Echosmith', \"Women's colleges in the Southern United States\", 'First Arthur County Courthouse and Jail', \"Arthur's Magazine\", '2014–15 Ukrainian Hockey Championship', 'First for Women', 'Freeway Complex Fire', 'William Rast'], 'sentences': [[\"Radio City is India's first private FM radio station and was started on 3 July 2001.\", ' It broadcasts on 91.1 (earlier 91.0 in most cities) megahertz from Mumbai (where it was started in 2004), Bengaluru (started first in 2001), Lucknow and New Delhi (since 2003).', ' It plays Hindi, English and regional songs.', ' It was launched in Hyderabad in March 2006, in Chennai on 7 July 2006 and in Visakhapatnam October 2007.', ' Radio City recently forayed into New Media in May 2008 with the launch of a music portal - PlanetRadiocity.com that offers music related news, videos, songs, and other music-related features.', ' The Radio station currently plays a mix of Hindi and Regional music.', ' Abraham Thomas is the CEO of the company.'], ['Football in Albania existed before the Albanian Football Federation (FSHF) was created.', \" This was evidenced by the team's registration at the Balkan Cup tournament during 1929-1931, which started in 1929 (although Albania eventually had pressure from the teams because of competition, competition started first and was strong enough in the duels) .\", ' Albanian National Team was founded on June 6, 1930, but Albania had to wait 16 years to play its 
first international match and then defeated Yugoslavia in 1946.', ' In 1932, Albania joined FIFA (during the 12–16 June convention ) And in 1954 she was one of the founding members of UEFA.'], ['Echosmith is an American, Corporate indie pop band formed in February 2009 in Chino, California.', ' Originally formed as a quartet of siblings, the band currently consists of Sydney, Noah and Graham Sierota, following the departure of eldest sibling Jamie in late 2016.', ' Echosmith started first as \"Ready Set Go!\"', ' until they signed to Warner Bros.', ' Records in May 2012.', ' They are best known for their hit song \"Cool Kids\", which reached number 13 on the \"Billboard\" Hot 100 and was certified double platinum by the RIAA with over 1,200,000 sales in the United States and also double platinum by ARIA in Australia.', ' The song was Warner Bros.', \" Records' fifth-biggest-selling-digital song of 2014, with 1.3 million downloads sold.\", ' The band\\'s debut album, \"Talking Dreams\", was released on October 8, 2013.'], [\"Women's colleges in the Southern United States refers to undergraduate, bachelor's degree–granting institutions, often liberal arts colleges, whose student populations consist exclusively or almost exclusively of women, located in the Southern United States.\", \" Many started first as girls' seminaries or academies.\", ' Salem College is the oldest female educational institution in the South and Wesleyan College is the first that was established specifically as a college for women.', ' Some schools, such as Mary Baldwin University and Salem College, offer coeducational courses at the graduate level.'], ['The First Arthur County Courthouse and Jail, was perhaps the smallest court house in the United States, and serves now as a museum.'], [\"Arthur's Magazine (1844–1846) was an American literary periodical published in Philadelphia in the 19th century.\", ' Edited by T.S. Arthur, it featured work by Edgar A. Poe, J.H. 
Ingraham, Sarah Josepha Hale, Thomas G. Spear, and others.', ' In May 1846 it was merged into \"Godey\\'s Lady\\'s Book\".'], ['The 2014–15 Ukrainian Hockey Championship was the 23rd season of the Ukrainian Hockey Championship.', ' Only four teams participated in the league this season, because of the instability in Ukraine and that most of the clubs had economical issues.', ' Generals Kiev was the only team that participated in the league the previous season, and the season started first after the year-end of 2014.', ' The regular season included just 12 rounds, where all the teams went to the semifinals.', ' In the final, ATEK Kiev defeated the regular season winner HK Kremenchuk.'], [\"First for Women is a woman's magazine published by Bauer Media Group in the USA.\", ' The magazine was started in 1989.', ' It is based in Englewood Cliffs, New Jersey.', ' In 2011 the circulation of the magazine was 1,310,696 copies.'], ['The Freeway Complex Fire was a 2008 wildfire in the Santa Ana Canyon area of Orange County, California.', ' The fire started as two separate fires on November 15, 2008.', ' The \"Freeway Fire\" started first shortly after 9am with the \"Landfill Fire\" igniting approximately 2 hours later.', ' These two separate fires merged a day later and ultimately destroyed 314 residences in Anaheim Hills and Yorba Linda.'], ['William Rast is an American clothing line founded by Justin Timberlake and Trace Ayala.', ' It is most known for their premium jeans.', ' On October 17, 2006, Justin Timberlake and Trace Ayala put on their first fashion show to launch their new William Rast clothing line.', ' The label also produces other clothing items such as jackets and tops.', ' The company started first as a denim line, later evolving into a men’s and women’s clothing line.']]}}\n", + "ground truth context: {'title': [\"Arthur's Magazine\", 'First for Women'], 'sent_id': [0, 0]}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"/Users/mengliu/Library/Caches/pypoetry/virtualenvs/lightrag-project-OrKUABKc-py3.12/lib/python3.12/site-packages/datasets/table.py:1421: FutureWarning: promote has been superseded by promote_options='default'.\n", + " table = cls._concat_blocks(blocks, axis=0)\n" + ] + } + ], + "source": [ + "# Load the HotpotQA dataset. We select a subset of the dataset for demonstration purposes.\n", + "dataset = load_dataset(path=\"hotpot_qa\", name=\"fullwiki\")\n", + "dataset = dataset[\"train\"].select(range(5))\n", + "print(f\"example: {dataset[0]}\")\n", + "print(f\"ground truth context: {dataset[0]['supporting_facts']}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Define a simple RAG pipeline**. Define a RAG pipeline by specifying the key components, such as *vectorizer*, *retriever*, and *generator*. For more information on these components, refer to the developer notes." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# The defined RAG pipeline.\n", + "class RAG(Component):\n", + "\n", + " def __init__(self, settings: dict):\n", + " super().__init__()\n", + " self.vectorizer_settings = settings[\"vectorizer\"]\n", + " self.retriever_settings = settings[\"retriever\"]\n", + " self.generator_model_kwargs = settings[\"generator\"]\n", + " self.text_splitter_settings = settings[\"text_splitter\"]\n", + "\n", + " vectorizer = Embedder(\n", + " model_client=OpenAIClient(),\n", + " model_kwargs=self.vectorizer_settings[\"model_kwargs\"],\n", + " )\n", + "\n", + " text_splitter = DocumentSplitter(\n", + " split_by=self.text_splitter_settings[\"split_by\"],\n", + " split_length=self.text_splitter_settings[\"chunk_size\"],\n", + " split_overlap=self.text_splitter_settings[\"chunk_overlap\"],\n", + " )\n", + " self.data_transformer = Sequential(\n", + " text_splitter,\n", + " ToEmbeddings(\n", + " vectorizer=vectorizer,\n", + " batch_size=self.vectorizer_settings[\"batch_size\"],\n", 
+ " ),\n", + " )\n", + " self.data_transformer_key = self.data_transformer._get_name()\n", + " # initialize retriever, which depends on the vectorizer too\n", + " self.retriever = FAISSRetriever(\n", + " top_k=self.retriever_settings[\"top_k\"],\n", + " dimensions=self.vectorizer_settings[\"model_kwargs\"][\"dimensions\"],\n", + " vectorizer=vectorizer,\n", + " )\n", + " self.retriever_output_processors = RetrieverOutputToContextStr(deduplicate=True)\n", + "\n", + " self.db = LocalDocumentDB()\n", + "\n", + " # initialize generator\n", + " self.generator = Generator(\n", + " preset_prompt_kwargs={\n", + " \"task_desc_str\": r\"\"\"\n", + " You are a helpful assistant.\n", + "\n", + " Your task is to answer the query that may or may not come with context information.\n", + " When context is provided, you should stick to the context and less on your prior knowledge to answer the query.\n", + "\n", + " Output JSON format:\n", + " {\n", + " \"answer\": \"The answer to the query\",\n", + " }\"\"\"\n", + " },\n", + " model_client=OpenAIClient(),\n", + " model_kwargs=self.generator_model_kwargs,\n", + " output_processors=JsonParser(),\n", + " )\n", + " self.tracking = {\"vectorizer\": {\"num_calls\": 0, \"num_tokens\": 0}}\n", + "\n", + " def build_index(self, documents: List[Document]):\n", + " self.db.load_documents(documents)\n", + " self.map_key = self.db.map_data()\n", + " print(f\"map_key: {self.map_key}\")\n", + " self.data_key = self.db.transform_data(self.data_transformer)\n", + " print(f\"data_key: {self.data_key}\")\n", + " self.transformed_documents = self.db.get_transformed_data(self.data_key)\n", + " self.retriever.build_index_from_documents(self.transformed_documents)\n", + "\n", + " def generate(self, query: str, context: Optional[str] = None) -> Any:\n", + " if not self.generator:\n", + " raise ValueError(\"Generator is not set\")\n", + "\n", + " prompt_kwargs = {\n", + " \"context_str\": context,\n", + " \"input_str\": query,\n", + " }\n", + " response = 
self.generator(prompt_kwargs=prompt_kwargs)\n",
+    "        if response.error:\n",
+    "            raise ValueError(f\"Error in generator: {response.error}\")\n",
+    "        return response.data\n",
+    "\n",
+    "    def call(self, query: str) -> Any:\n",
+    "        retrieved_documents = self.retriever(query)\n",
+    "        # fill in the document\n",
+    "        for i, retriever_output in enumerate(retrieved_documents):\n",
+    "            retrieved_documents[i].documents = [\n",
+    "                self.transformed_documents[doc_index]\n",
+    "                for doc_index in retriever_output.doc_indexes\n",
+    "            ]\n",
+    "        # convert all the documents to context string\n",
+    "        context_str = self.retriever_output_processors(retrieved_documents)\n",
+    "\n",
+    "        return self.generate(query, context=context_str), context_str"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To run the RAG pipeline for each example in the dataset, we need to first **build the index** and then **call the pipeline**. For each sample in the dataset, we create a list of documents to retrieve from according to its corresponding *context* in the dataset. Each document has a title and a list of sentences. We use the `Document` class from `lightrag.core.types` to represent each document."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "map_key: lightrag.core.db.\n", + "start to split documents\n", + "splitted_doc: [Document(id=54e3ca94-8148-4c1a-8f00-57c3f5c0c074, text=Radio City is India's first private FM radio station and was started on 3 July 2001., meta_data={'title': 'Radio City (Indian radio station)'}, estimated_num_tokens=21, parent_doc_id=a69d50f7-6186-446a-9952-b9975da778ae), Document(id=858e5dba-30ad-4cee-9279-0b481ee14bd7, text= It broadcasts on 91., meta_data={'title': 'Radio City (Indian radio station)'}, estimated_num_tokens=7, parent_doc_id=a69d50f7-6186-446a-9952-b9975da778ae), Document(id=9aa39091-bdcc-4a7c-9606-3cd816f95d94, text=1 (earlier 91., meta_data={'title': 'Radio City (Indian radio station)'}, estimated_num_tokens=7, parent_doc_id=a69d50f7-6186-446a-9952-b9975da778ae), Document(id=ac16cabd-223c-4f3c-85e0-eb35b83ca922, text=0 in most cities) megahertz from Mumbai (where it was started in 2004), Bengaluru (started first in 2001), Lucknow and New Delhi (since 2003)., meta_data={'title': 'Radio City (Indian radio station)'}, estimated_num_tokens=41, parent_doc_id=a69d50f7-6186-446a-9952-b9975da778ae), Document(id=2eda759e-d97c-4184-84fe-5db4550347ca, text= It plays Hindi, English and regional songs., meta_data={'title': 'Radio City (Indian radio station)'}, estimated_num_tokens=10, parent_doc_id=a69d50f7-6186-446a-9952-b9975da778ae), Document(id=0e78080a-9235-49f1-bbcf-524b2ca34a09, text= It was launched in Hyderabad in March 2006, in Chennai on 7 July 2006 and in Visakhapatnam October 2007., meta_data={'title': 'Radio City (Indian radio station)'}, estimated_num_tokens=32, parent_doc_id=a69d50f7-6186-446a-9952-b9975da778ae), Document(id=4c419002-253a-4235-a9a2-aa2ac64e3026, text= Radio City recently forayed into New Media in May 2008 with the launch of a music portal - PlanetRadiocity., meta_data={'title': 
'Radio City (Indian radio station)'}, estimated_num_tokens=26, parent_doc_id=a69d50f7-6186-446a-9952-b9975da778ae), Document(id=1db85923-cdef-4a24-9e3a-7f9e2ff053a7, text=com that offers music related news, videos, songs, and other music-related features., meta_data={'title': 'Radio City (Indian radio station)'}, estimated_num_tokens=17, parent_doc_id=a69d50f7-6186-446a-9952-b9975da778ae), Document(id=aef124ad-d406-48d1-8321-f68d0b3705e2, text= The Radio station currently plays a mix of Hindi and Regional music., meta_data={'title': 'Radio City (Indian radio station)'}, estimated_num_tokens=14, parent_doc_id=a69d50f7-6186-446a-9952-b9975da778ae), Document(id=b2721c1b-0479-474e-b4ab-03e0c9aa9aca, text= Abraham Thomas is the CEO of the company., meta_data={'title': 'Radio City (Indian radio station)'}, estimated_num_tokens=10, parent_doc_id=a69d50f7-6186-446a-9952-b9975da778ae), Document(id=1667c8d3-f75e-4f1b-a97f-70dfdeafdefb, text=Football in Albania existed before the Albanian Football Federation (FSHF) was created., meta_data={'title': 'History of Albanian football'}, estimated_num_tokens=17, parent_doc_id=5471d81d-0ef0-47a1-b683-899df5da9627), Document(id=affe3c50-0635-4db6-ada5-140031dd919d, text= This was evidenced by the team's registration at the Balkan Cup tournament during 1929-1931, which started in 1929 (although Albania eventually had pressure from the teams because of competition, competition started first and was strong enough in the duels) ., meta_data={'title': 'History of Albanian football'}, estimated_num_tokens=55, parent_doc_id=5471d81d-0ef0-47a1-b683-899df5da9627), Document(id=2f6a73aa-f0e4-4e38-840c-8749718e09f7, text= Albanian National Team was founded on June 6, 1930, but Albania had to wait 16 years to play its first international match and then defeated Yugoslavia in 1946., meta_data={'title': 'History of Albanian football'}, estimated_num_tokens=39, parent_doc_id=5471d81d-0ef0-47a1-b683-899df5da9627), 
Document(id=87caef3b-b4e9-4a35-9757-60eb8ece15c8, text= In 1932, Albania joined FIFA (during the 12–16 June convention ) And in 1954 she was one of the founding members of UEFA., meta_data={'title': 'History of Albanian football'}, estimated_num_tokens=34, parent_doc_id=5471d81d-0ef0-47a1-b683-899df5da9627), Document(id=754203bf-51db-43d3-8480-e55f36d4436c, text=Echosmith is an American, Corporate indie pop band formed in February 2009 in Chino, California., meta_data={'title': 'Echosmith'}, estimated_num_tokens=23, parent_doc_id=5df4835f-8312-4c26-abac-71c5a2aa24c8), Document(id=e49ffc42-18bc-474d-b361-cefe3c89f288, text= Originally formed as a quartet of siblings, the band currently consists of Sydney, Noah and Graham Sierota, following the departure of eldest sibling Jamie in late 2016., meta_data={'title': 'Echosmith'}, estimated_num_tokens=37, parent_doc_id=5df4835f-8312-4c26-abac-71c5a2aa24c8), Document(id=80dc23f0-b219-4f40-8ad1-4fb8ef1aa9cc, text= Echosmith started first as \"Ready Set Go!\" until they signed to Warner Bros., meta_data={'title': 'Echosmith'}, estimated_num_tokens=20, parent_doc_id=5df4835f-8312-4c26-abac-71c5a2aa24c8), Document(id=f910e6e9-2d31-42fb-85ac-793f05596351, text= Records in May 2012., meta_data={'title': 'Echosmith'}, estimated_num_tokens=8, parent_doc_id=5df4835f-8312-4c26-abac-71c5a2aa24c8), Document(id=aeac71e5-0385-4d1f-9a8b-2d028a72c45a, text= They are best known for their hit song \"Cool Kids\", which reached number 13 on the \"Billboard\" Hot 100 and was certified double platinum by the RIAA with over 1,200,000 sales in the United States and also double platinum by ARIA in Australia., meta_data={'title': 'Echosmith'}, estimated_num_tokens=60, parent_doc_id=5df4835f-8312-4c26-abac-71c5a2aa24c8), Document(id=b84d6161-4c5c-46e1-8821-141c6901ee41, text= The song was Warner Bros., meta_data={'title': 'Echosmith'}, estimated_num_tokens=7, parent_doc_id=5df4835f-8312-4c26-abac-71c5a2aa24c8), 
Document(id=a3b12aaa-b5f9-482c-8d96-6e1cf3bc1022, text= Records' fifth-biggest-selling-digital song of 2014, with 1., meta_data={'title': 'Echosmith'}, estimated_num_tokens=19, parent_doc_id=5df4835f-8312-4c26-abac-71c5a2aa24c8), Document(id=0c854314-7baf-4e65-b8d3-5c1f6b5a0eb6, text=3 million downloads sold., meta_data={'title': 'Echosmith'}, estimated_num_tokens=5, parent_doc_id=5df4835f-8312-4c26-abac-71c5a2aa24c8), Document(id=4ab4405f-9620-4e55-a62c-67afa651f866, text= The band's debut album, \"Talking Dreams\", was released on October 8, 2013., meta_data={'title': 'Echosmith'}, estimated_num_tokens=22, parent_doc_id=5df4835f-8312-4c26-abac-71c5a2aa24c8), Document(id=31e4d4aa-c52b-4815-b7b9-dc428adaa172, text=Women's colleges in the Southern United States refers to undergraduate, bachelor's degree–granting institutions, often liberal arts colleges, whose student populations consist exclusively or almost exclusively of women, located in the Southern United States., meta_data={'title': \"Women's colleges in the Southern United States\"}, estimated_num_tokens=43, parent_doc_id=51700d62-4ef3-4473-a23d-ee44ea36c384), Document(id=7f4f34df-f4cf-42a4-bd6d-8f6023fab1c5, text= Many started first as girls' seminaries or academies., meta_data={'title': \"Women's colleges in the Southern United States\"}, estimated_num_tokens=14, parent_doc_id=51700d62-4ef3-4473-a23d-ee44ea36c384), Document(id=9ae0bed3-570d-4e10-a95f-259476f677fb, text= Salem College is the oldest female educational institution in the South and Wesleyan College is the first that was established specifically as a college for women., meta_data={'title': \"Women's colleges in the Southern United States\"}, estimated_num_tokens=29, parent_doc_id=51700d62-4ef3-4473-a23d-ee44ea36c384), Document(id=b2cca951-04c8-4601-a873-79b6744939b8, text= Some schools, such as Mary Baldwin University and Salem College, offer coeducational courses at the graduate level., meta_data={'title': \"Women's colleges in the Southern 
United States\"}, estimated_num_tokens=23, parent_doc_id=51700d62-4ef3-4473-a23d-ee44ea36c384), Document(id=faa34193-9973-45f5-a7d5-788da8aa4d60, text=The First Arthur County Courthouse and Jail, was perhaps the smallest court house in the United States, and serves now as a museum., meta_data={'title': 'First Arthur County Courthouse and Jail'}, estimated_num_tokens=27, parent_doc_id=44543f35-9b5b-41dc-a4e9-6cadc4ecc9d5), Document(id=b1678550-4a22-44a0-8487-ab9e626189aa, text=Arthur's Magazine (1844–1846) was an American literary periodical published in Philadelphia in the 19th century., meta_data={'title': \"Arthur's Magazine\"}, estimated_num_tokens=26, parent_doc_id=6db097de-1c4f-410f-8777-cdca18fa67e5), Document(id=7ef7d4fe-5510-46ed-bd65-afa15ae5fc77, text= Edited by T., meta_data={'title': \"Arthur's Magazine\"}, estimated_num_tokens=5, parent_doc_id=6db097de-1c4f-410f-8777-cdca18fa67e5), Document(id=ee831115-8460-44b3-ac8b-7e681dd3e6a1, text=S., meta_data={'title': \"Arthur's Magazine\"}, estimated_num_tokens=2, parent_doc_id=6db097de-1c4f-410f-8777-cdca18fa67e5), Document(id=a06d4866-6c5a-4264-bf00-5a4d2fef0e2c, text= Arthur, it featured work by Edgar A., meta_data={'title': \"Arthur's Magazine\"}, estimated_num_tokens=9, parent_doc_id=6db097de-1c4f-410f-8777-cdca18fa67e5), Document(id=9c672c7b-0f63-4c4c-a2a7-7af6b588c08f, text= Poe, J., meta_data={'title': \"Arthur's Magazine\"}, estimated_num_tokens=4, parent_doc_id=6db097de-1c4f-410f-8777-cdca18fa67e5), Document(id=15677695-c18a-46a2-a99e-f8a73db1bb99, text=H., meta_data={'title': \"Arthur's Magazine\"}, estimated_num_tokens=2, parent_doc_id=6db097de-1c4f-410f-8777-cdca18fa67e5), Document(id=7b9854c3-4174-4e11-babd-0f0c3510c84b, text= Ingraham, Sarah Josepha Hale, Thomas G., meta_data={'title': \"Arthur's Magazine\"}, estimated_num_tokens=13, parent_doc_id=6db097de-1c4f-410f-8777-cdca18fa67e5), Document(id=87e1f254-46ac-48f7-b073-b0bdeca754ec, text= Spear, and others., meta_data={'title': \"Arthur's 
Magazine\"}, estimated_num_tokens=5, parent_doc_id=6db097de-1c4f-410f-8777-cdca18fa67e5), Document(id=9795985e-b5f6-4aa7-8455-965a3dc11d6c, text= In May 1846 it was merged into \"Godey's Lady's Book\"., meta_data={'title': \"Arthur's Magazine\"}, estimated_num_tokens=19, parent_doc_id=6db097de-1c4f-410f-8777-cdca18fa67e5), Document(id=0fcfd512-0155-4eec-bf2c-de1b0a6a654e, text=The 2014–15 Ukrainian Hockey Championship was the 23rd season of the Ukrainian Hockey Championship., meta_data={'title': '2014–15 Ukrainian Hockey Championship'}, estimated_num_tokens=21, parent_doc_id=c5ea8410-0200-4a87-8e5a-fe4b29cebdfe), Document(id=d9253bf0-fd55-4703-980e-6af4d21d29f8, text= Only four teams participated in the league this season, because of the instability in Ukraine and that most of the clubs had economical issues., meta_data={'title': '2014–15 Ukrainian Hockey Championship'}, estimated_num_tokens=27, parent_doc_id=c5ea8410-0200-4a87-8e5a-fe4b29cebdfe), Document(id=5df4bbcc-9258-4239-ab5c-9f04b5d62d9a, text= Generals Kiev was the only team that participated in the league the previous season, and the season started first after the year-end of 2014., meta_data={'title': '2014–15 Ukrainian Hockey Championship'}, estimated_num_tokens=31, parent_doc_id=c5ea8410-0200-4a87-8e5a-fe4b29cebdfe), Document(id=b0f067b0-3813-41a1-8241-45a628cf1ecf, text= The regular season included just 12 rounds, where all the teams went to the semifinals., meta_data={'title': '2014–15 Ukrainian Hockey Championship'}, estimated_num_tokens=20, parent_doc_id=c5ea8410-0200-4a87-8e5a-fe4b29cebdfe), Document(id=23bfb91f-3bf6-4d8a-91c6-cbfbb82b9aee, text= In the final, ATEK Kiev defeated the regular season winner HK Kremenchuk., meta_data={'title': '2014–15 Ukrainian Hockey Championship'}, estimated_num_tokens=20, parent_doc_id=c5ea8410-0200-4a87-8e5a-fe4b29cebdfe), Document(id=7a039ae2-cf6a-49b9-ada2-ebc496a2f8cf, text=First for Women is a woman's magazine published by Bauer Media Group in the USA., 
meta_data={'title': 'First for Women'}, estimated_num_tokens=17, parent_doc_id=35fc4eb7-7708-4e40-b045-4c73acb2204a), Document(id=52a20932-9738-4dc1-a9fb-895fba700879, text= The magazine was started in 1989., meta_data={'title': 'First for Women'}, estimated_num_tokens=10, parent_doc_id=35fc4eb7-7708-4e40-b045-4c73acb2204a), Document(id=f062e075-512d-49dd-abe0-bafbebc9dc0c, text= It is based in Englewood Cliffs, New Jersey., meta_data={'title': 'First for Women'}, estimated_num_tokens=14, parent_doc_id=35fc4eb7-7708-4e40-b045-4c73acb2204a), Document(id=213de99b-f082-4d32-a553-27611b97c78a, text= In 2011 the circulation of the magazine was 1,310,696 copies., meta_data={'title': 'First for Women'}, estimated_num_tokens=19, parent_doc_id=35fc4eb7-7708-4e40-b045-4c73acb2204a), Document(id=708ae365-5f5d-400c-827e-7b48a12a938d, text=The Freeway Complex Fire was a 2008 wildfire in the Santa Ana Canyon area of Orange County, California., meta_data={'title': 'Freeway Complex Fire'}, estimated_num_tokens=23, parent_doc_id=1197b099-643e-45a2-89a3-4ecd0708a648), Document(id=c9513000-fca5-48cf-9f69-e7931d3fed6a, text= The fire started as two separate fires on November 15, 2008., meta_data={'title': 'Freeway Complex Fire'}, estimated_num_tokens=17, parent_doc_id=1197b099-643e-45a2-89a3-4ecd0708a648), Document(id=bcf96704-624a-48f2-875c-8000ff2c55b4, text= The \"Freeway Fire\" started first shortly after 9am with the \"Landfill Fire\" igniting approximately 2 hours later., meta_data={'title': 'Freeway Complex Fire'}, estimated_num_tokens=29, parent_doc_id=1197b099-643e-45a2-89a3-4ecd0708a648), Document(id=12980697-3246-4bbc-a076-10f7187c8bea, text= These two separate fires merged a day later and ultimately destroyed 314 residences in Anaheim Hills and Yorba Linda., meta_data={'title': 'Freeway Complex Fire'}, estimated_num_tokens=24, parent_doc_id=1197b099-643e-45a2-89a3-4ecd0708a648), Document(id=f55ea350-85e5-4e1c-95a2-1f58baebfc8d, text=William Rast is an American clothing 
line founded by Justin Timberlake and Trace Ayala., meta_data={'title': 'William Rast'}, estimated_num_tokens=18, parent_doc_id=c16062a0-b92d-40ce-b8b1-7f98c5499aad), Document(id=456503d3-b061-4fdf-96ba-eb967f490f89, text= It is most known for their premium jeans., meta_data={'title': 'William Rast'}, estimated_num_tokens=10, parent_doc_id=c16062a0-b92d-40ce-b8b1-7f98c5499aad), Document(id=7bb89815-0734-4977-8caf-7e1076e351ae, text= On October 17, 2006, Justin Timberlake and Trace Ayala put on their first fashion show to launch their new William Rast clothing line., meta_data={'title': 'William Rast'}, estimated_num_tokens=33, parent_doc_id=c16062a0-b92d-40ce-b8b1-7f98c5499aad), Document(id=70e9204e-910d-496e-a200-dbba6764a345, text= The label also produces other clothing items such as jackets and tops., meta_data={'title': 'William Rast'}, estimated_num_tokens=14, parent_doc_id=c16062a0-b92d-40ce-b8b1-7f98c5499aad), Document(id=b8c0f1df-c383-4ad7-bf7e-b9eef7c38bb7, text= The company started first as a denim line, later evolving into a men’s and women’s clothing line., meta_data={'title': 'William Rast'}, estimated_num_tokens=22, parent_doc_id=c16062a0-b92d-40ce-b8b1-7f98c5499aad)]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 1/1 [00:00<00:00, 1.84it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "data_key: Sequential_transformed\n" + ] + }, + { + "ename": "", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n", + "\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n", + "\u001b[1;31mClick here for more info. \n", + "\u001b[1;31mView Jupyter log for further details." + ] + } + ], + "source": [ + "# To get the ground truth context string from the supporting_facts filed in HotpotQA. 
This function is specific to the HotpotQA dataset.\n", + "def get_supporting_sentences(\n", + " supporting_facts: dict[str, list[Union[str, int]]], context: dict[str, list[str]]\n", + ") -> List[str]:\n", + " \"\"\"\n", + " Extract the supporting sentences from the context based on the supporting facts.\n", + " \"\"\"\n", + " extracted_sentences = []\n", + " for title, sent_id in zip(supporting_facts[\"title\"], supporting_facts[\"sent_id\"]):\n", + " if title in context[\"title\"]:\n", + " index = context[\"title\"].index(title)\n", + " sentence = context[\"sentences\"][index][sent_id]\n", + " extracted_sentences.append(sentence)\n", + " return extracted_sentences\n", + "\n", + "\n", + "all_questions = []\n", + "all_retrieved_context = []\n", + "all_gt_context = []\n", + "all_pred_answer = []\n", + "all_gt_answer = []\n", + "for data in dataset:\n", + " # build the document list\n", + " num_docs = len(data[\"context\"][\"title\"])\n", + " doc_list = [\n", + " Document(\n", + " meta_data={\"title\": data[\"context\"][\"title\"][i]},\n", + " text=\" \".join(data[\"context\"][\"sentences\"][i]),\n", + " )\n", + " for i in range(num_docs)\n", + " ]\n", + " # rag = RAG(settings)\n", + " # # build the index\n", + " # rag.build_index(doc_list)\n", + " # # call the pipeline\n", + " # query = data[\"question\"]\n", + " # response, context_str = rag.call(query)\n", + " # import ipdb; ipdb.set_trace()\n", + " # gt_context_sentence_list = get_supporting_sentences(\n", + " # data[\"supporting_facts\"], data[\"context\"]\n", + " # )\n", + " # all_questions.append(query)\n", + " # all_retrieved_context.append(context_str)\n", + " # all_gt_context.append(gt_context_sentence_list)\n", + " # all_pred_answer.append(response[\"answer\"])\n", + " # all_gt_answer.append(data[\"answer\"])\n", + " # print(f\"query: {query}\")\n", + " # print(f\"response: {response['answer']}\")\n", + " # print(f\"ground truth response: {data['answer']}\")\n", + " # print(f\"context_str: 
{context_str}\")\n", + " # print(f\"ground truth context_str: {gt_context_sentence_list}\")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "lightrag-project", + "language": "python", + "name": "light-rag-project" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/use_cases/retrieval_augmented_generation/simple_rag.yaml b/use_cases/retrieval_augmented_generation/simple_rag.yaml new file mode 100644 index 00000000..c02b61b8 --- /dev/null +++ b/use_cases/retrieval_augmented_generation/simple_rag.yaml @@ -0,0 +1,19 @@ +vectorizer: + batch_size: 100 + model_kwargs: + model: text-embedding-3-small + dimensions: 256 + encoding_format: float + +retriever: + top_k: 2 + +generator: + model: gpt-3.5-turbo + temperature: 0.3 + stream: false + +text_splitter: + split_by: sentence + chunk_size: 1 + chunk_overlap: 0 From 7cc76af378af4e603042f02234e8f75743484ccf Mon Sep 17 00:00:00 2001 From: mengliu1998 <604629@gmail.com> Date: Sun, 16 Jun 2024 16:49:22 -0700 Subject: [PATCH 05/20] Add simple RAG notebook --- lightrag/tests/test_evaluators.py | 6 +- .../simple_rag.ipynb | 238 ++++++++++++------ 2 files changed, 159 insertions(+), 85 deletions(-) diff --git a/lightrag/tests/test_evaluators.py b/lightrag/tests/test_evaluators.py index 71b460b1..fb0636f5 100644 --- a/lightrag/tests/test_evaluators.py +++ b/lightrag/tests/test_evaluators.py @@ -13,9 +13,9 @@ def test_answer_match_acc(): pred_answers = ["positive", "negative", "this is neutral"] gt_answers = ["positive", "negative", "neutral"] answer_match_acc = AnswerMatchAcc(type="exact_match") - result, result_list = answer_match_acc.compute(pred_answers, gt_answers) - assert result == 2 / 3 - assert result_list == [1.0, 1.0, 0.0] + avg_acc, 
acc_list = answer_match_acc.compute(pred_answers, gt_answers)
+    assert avg_acc == 2 / 3
+    assert acc_list == [1.0, 1.0, 0.0]
     answer_match_acc = AnswerMatchAcc(type="fuzzy_match")
     avg_acc, acc_list = answer_match_acc.compute(pred_answers, gt_answers)
     assert avg_acc == 1.0
diff --git a/use_cases/retrieval_augmented_generation/simple_rag.ipynb b/use_cases/retrieval_augmented_generation/simple_rag.ipynb
index b1912af4..d5676a81 100644
--- a/use_cases/retrieval_augmented_generation/simple_rag.ipynb
+++ b/use_cases/retrieval_augmented_generation/simple_rag.ipynb
@@ -16,7 +16,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 29,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -37,18 +37,40 @@
     "from lightrag.components.retriever import FAISSRetriever\n",
     "from lightrag.core.generator import Generator\n",
     "from lightrag.core.db import LocalDocumentDB\n",
-    "from lightrag.core.string_parser import JsonParser"
+    "from lightrag.core.string_parser import JsonParser\n",
+    "\n",
+    "from lightrag.eval import (\n",
+    "    AnswerMatchAcc,\n",
+    "    RetrieverRecall,\n",
+    "    RetrieverRelevance,\n",
+    "    LLMasJudge,\n",
+    ")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 30,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "False"
+      ]
+     },
+     "execution_count": 30,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "# Here, we use the OpenAIClient in the Generator as an example, but you can use any other clients (with the corresponding API Key as needed)\n",
     "from lightrag.components.model_client import OpenAIClient\n",
-    "OPENAI_API_KEY=\"YOUR_API_KEY\" # Replace with your OpenAI API Key, or you can put it in a .env file"
+    "# OPENAI_API_KEY=\"YOUR_API_KEY\" # Replace with your OpenAI API Key, or you can put it in a .env file\n",
+    "OPENAI_API_KEY=\"sk-REDACTED\" # NOTE(review): a real API key was committed here; it has been redacted and the original key must be revoked\n",
+    "import dotenv\n",
+    "# load evironment\n",
+    
"dotenv.load_dotenv(dotenv_path=\".env\", override=True)" ] }, { @@ -60,7 +82,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 31, "metadata": {}, "outputs": [ { @@ -87,7 +109,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 32, "metadata": {}, "outputs": [ { @@ -97,22 +119,14 @@ "example: {'id': '5a7a06935542990198eaf050', 'question': \"Which magazine was started first Arthur's Magazine or First for Women?\", 'answer': \"Arthur's Magazine\", 'type': 'comparison', 'level': 'medium', 'supporting_facts': {'title': [\"Arthur's Magazine\", 'First for Women'], 'sent_id': [0, 0]}, 'context': {'title': ['Radio City (Indian radio station)', 'History of Albanian football', 'Echosmith', \"Women's colleges in the Southern United States\", 'First Arthur County Courthouse and Jail', \"Arthur's Magazine\", '2014–15 Ukrainian Hockey Championship', 'First for Women', 'Freeway Complex Fire', 'William Rast'], 'sentences': [[\"Radio City is India's first private FM radio station and was started on 3 July 2001.\", ' It broadcasts on 91.1 (earlier 91.0 in most cities) megahertz from Mumbai (where it was started in 2004), Bengaluru (started first in 2001), Lucknow and New Delhi (since 2003).', ' It plays Hindi, English and regional songs.', ' It was launched in Hyderabad in March 2006, in Chennai on 7 July 2006 and in Visakhapatnam October 2007.', ' Radio City recently forayed into New Media in May 2008 with the launch of a music portal - PlanetRadiocity.com that offers music related news, videos, songs, and other music-related features.', ' The Radio station currently plays a mix of Hindi and Regional music.', ' Abraham Thomas is the CEO of the company.'], ['Football in Albania existed before the Albanian Football Federation (FSHF) was created.', \" This was evidenced by the team's registration at the Balkan Cup tournament during 1929-1931, which started in 1929 (although Albania eventually had pressure from the teams because of 
competition, competition started first and was strong enough in the duels) .\", ' Albanian National Team was founded on June 6, 1930, but Albania had to wait 16 years to play its first international match and then defeated Yugoslavia in 1946.', ' In 1932, Albania joined FIFA (during the 12–16 June convention ) And in 1954 she was one of the founding members of UEFA.'], ['Echosmith is an American, Corporate indie pop band formed in February 2009 in Chino, California.', ' Originally formed as a quartet of siblings, the band currently consists of Sydney, Noah and Graham Sierota, following the departure of eldest sibling Jamie in late 2016.', ' Echosmith started first as \"Ready Set Go!\"', ' until they signed to Warner Bros.', ' Records in May 2012.', ' They are best known for their hit song \"Cool Kids\", which reached number 13 on the \"Billboard\" Hot 100 and was certified double platinum by the RIAA with over 1,200,000 sales in the United States and also double platinum by ARIA in Australia.', ' The song was Warner Bros.', \" Records' fifth-biggest-selling-digital song of 2014, with 1.3 million downloads sold.\", ' The band\\'s debut album, \"Talking Dreams\", was released on October 8, 2013.'], [\"Women's colleges in the Southern United States refers to undergraduate, bachelor's degree–granting institutions, often liberal arts colleges, whose student populations consist exclusively or almost exclusively of women, located in the Southern United States.\", \" Many started first as girls' seminaries or academies.\", ' Salem College is the oldest female educational institution in the South and Wesleyan College is the first that was established specifically as a college for women.', ' Some schools, such as Mary Baldwin University and Salem College, offer coeducational courses at the graduate level.'], ['The First Arthur County Courthouse and Jail, was perhaps the smallest court house in the United States, and serves now as a museum.'], [\"Arthur's Magazine (1844–1846) 
was an American literary periodical published in Philadelphia in the 19th century.\", ' Edited by T.S. Arthur, it featured work by Edgar A. Poe, J.H. Ingraham, Sarah Josepha Hale, Thomas G. Spear, and others.', ' In May 1846 it was merged into \"Godey\\'s Lady\\'s Book\".'], ['The 2014–15 Ukrainian Hockey Championship was the 23rd season of the Ukrainian Hockey Championship.', ' Only four teams participated in the league this season, because of the instability in Ukraine and that most of the clubs had economical issues.', ' Generals Kiev was the only team that participated in the league the previous season, and the season started first after the year-end of 2014.', ' The regular season included just 12 rounds, where all the teams went to the semifinals.', ' In the final, ATEK Kiev defeated the regular season winner HK Kremenchuk.'], [\"First for Women is a woman's magazine published by Bauer Media Group in the USA.\", ' The magazine was started in 1989.', ' It is based in Englewood Cliffs, New Jersey.', ' In 2011 the circulation of the magazine was 1,310,696 copies.'], ['The Freeway Complex Fire was a 2008 wildfire in the Santa Ana Canyon area of Orange County, California.', ' The fire started as two separate fires on November 15, 2008.', ' The \"Freeway Fire\" started first shortly after 9am with the \"Landfill Fire\" igniting approximately 2 hours later.', ' These two separate fires merged a day later and ultimately destroyed 314 residences in Anaheim Hills and Yorba Linda.'], ['William Rast is an American clothing line founded by Justin Timberlake and Trace Ayala.', ' It is most known for their premium jeans.', ' On October 17, 2006, Justin Timberlake and Trace Ayala put on their first fashion show to launch their new William Rast clothing line.', ' The label also produces other clothing items such as jackets and tops.', ' The company started first as a denim line, later evolving into a men’s and women’s clothing line.']]}}\n", "ground truth context: {'title': 
[\"Arthur's Magazine\", 'First for Women'], 'sent_id': [0, 0]}\n"
     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/Users/mengliu/Library/Caches/pypoetry/virtualenvs/lightrag-project-OrKUABKc-py3.12/lib/python3.12/site-packages/datasets/table.py:1421: FutureWarning: promote has been superseded by promote_options='default'.\n",
-      " table = cls._concat_blocks(blocks, axis=0)\n"
-     ]
    }
   ],
   "source": [
    "# Load the HotpotQA dataset. We select a subset of the dataset for demonstration purposes.\n",
    "dataset = load_dataset(path=\"hotpot_qa\", name=\"fullwiki\")\n",
-    "dataset = dataset[\"train\"].select(range(5))\n",
-    "print(f\"example: {dataset[0]}\")\n",
-    "print(f\"ground truth context: {dataset[0]['supporting_facts']}\")"
+    "selected_dataset = dataset[\"train\"].select(range(5))\n",
+    "print(f\"example: {selected_dataset[0]}\")\n",
+    "print(f\"ground truth context: {selected_dataset[0]['supporting_facts']}\")"
    ]
   },
   {
@@ -124,7 +138,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 33,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -226,49 +240,14 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "To run the RAG piepline for each example in the dataset, we need to first **build the index** and then **call the pipeline**. For each sample in the dataset, we create a list of documents to retrieve from according to its corresponding *context* in the dataset. Each document has a title and a list of sentences. We use the `Document` class from `lightrag.core.types` to represent each document."
+    "To run the RAG pipeline for each example in the dataset, we need to first **build the index** and then **call the pipeline**. For each sample in the dataset, we create a list of documents to retrieve from, according to its corresponding *context* in the dataset. Each document has a title and a list of sentences. We use the `Document` class from `lightrag.core.types` to represent each document."
    
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "map_key: lightrag.core.db.\n", - "start to split documents\n", - "splitted_doc: [Document(id=54e3ca94-8148-4c1a-8f00-57c3f5c0c074, text=Radio City is India's first private FM radio station and was started on 3 July 2001., meta_data={'title': 'Radio City (Indian radio station)'}, estimated_num_tokens=21, parent_doc_id=a69d50f7-6186-446a-9952-b9975da778ae), Document(id=858e5dba-30ad-4cee-9279-0b481ee14bd7, text= It broadcasts on 91., meta_data={'title': 'Radio City (Indian radio station)'}, estimated_num_tokens=7, parent_doc_id=a69d50f7-6186-446a-9952-b9975da778ae), Document(id=9aa39091-bdcc-4a7c-9606-3cd816f95d94, text=1 (earlier 91., meta_data={'title': 'Radio City (Indian radio station)'}, estimated_num_tokens=7, parent_doc_id=a69d50f7-6186-446a-9952-b9975da778ae), Document(id=ac16cabd-223c-4f3c-85e0-eb35b83ca922, text=0 in most cities) megahertz from Mumbai (where it was started in 2004), Bengaluru (started first in 2001), Lucknow and New Delhi (since 2003)., meta_data={'title': 'Radio City (Indian radio station)'}, estimated_num_tokens=41, parent_doc_id=a69d50f7-6186-446a-9952-b9975da778ae), Document(id=2eda759e-d97c-4184-84fe-5db4550347ca, text= It plays Hindi, English and regional songs., meta_data={'title': 'Radio City (Indian radio station)'}, estimated_num_tokens=10, parent_doc_id=a69d50f7-6186-446a-9952-b9975da778ae), Document(id=0e78080a-9235-49f1-bbcf-524b2ca34a09, text= It was launched in Hyderabad in March 2006, in Chennai on 7 July 2006 and in Visakhapatnam October 2007., meta_data={'title': 'Radio City (Indian radio station)'}, estimated_num_tokens=32, parent_doc_id=a69d50f7-6186-446a-9952-b9975da778ae), Document(id=4c419002-253a-4235-a9a2-aa2ac64e3026, text= Radio City recently forayed into New Media in May 2008 with the launch of a music portal - PlanetRadiocity., meta_data={'title': 'Radio City 
(Indian radio station)'}, estimated_num_tokens=26, parent_doc_id=a69d50f7-6186-446a-9952-b9975da778ae), Document(id=1db85923-cdef-4a24-9e3a-7f9e2ff053a7, text=com that offers music related news, videos, songs, and other music-related features., meta_data={'title': 'Radio City (Indian radio station)'}, estimated_num_tokens=17, parent_doc_id=a69d50f7-6186-446a-9952-b9975da778ae), Document(id=aef124ad-d406-48d1-8321-f68d0b3705e2, text= The Radio station currently plays a mix of Hindi and Regional music., meta_data={'title': 'Radio City (Indian radio station)'}, estimated_num_tokens=14, parent_doc_id=a69d50f7-6186-446a-9952-b9975da778ae), Document(id=b2721c1b-0479-474e-b4ab-03e0c9aa9aca, text= Abraham Thomas is the CEO of the company., meta_data={'title': 'Radio City (Indian radio station)'}, estimated_num_tokens=10, parent_doc_id=a69d50f7-6186-446a-9952-b9975da778ae), Document(id=1667c8d3-f75e-4f1b-a97f-70dfdeafdefb, text=Football in Albania existed before the Albanian Football Federation (FSHF) was created., meta_data={'title': 'History of Albanian football'}, estimated_num_tokens=17, parent_doc_id=5471d81d-0ef0-47a1-b683-899df5da9627), Document(id=affe3c50-0635-4db6-ada5-140031dd919d, text= This was evidenced by the team's registration at the Balkan Cup tournament during 1929-1931, which started in 1929 (although Albania eventually had pressure from the teams because of competition, competition started first and was strong enough in the duels) ., meta_data={'title': 'History of Albanian football'}, estimated_num_tokens=55, parent_doc_id=5471d81d-0ef0-47a1-b683-899df5da9627), Document(id=2f6a73aa-f0e4-4e38-840c-8749718e09f7, text= Albanian National Team was founded on June 6, 1930, but Albania had to wait 16 years to play its first international match and then defeated Yugoslavia in 1946., meta_data={'title': 'History of Albanian football'}, estimated_num_tokens=39, parent_doc_id=5471d81d-0ef0-47a1-b683-899df5da9627), Document(id=87caef3b-b4e9-4a35-9757-60eb8ece15c8, 
text= In 1932, Albania joined FIFA (during the 12–16 June convention ) And in 1954 she was one of the founding members of UEFA., meta_data={'title': 'History of Albanian football'}, estimated_num_tokens=34, parent_doc_id=5471d81d-0ef0-47a1-b683-899df5da9627), Document(id=754203bf-51db-43d3-8480-e55f36d4436c, text=Echosmith is an American, Corporate indie pop band formed in February 2009 in Chino, California., meta_data={'title': 'Echosmith'}, estimated_num_tokens=23, parent_doc_id=5df4835f-8312-4c26-abac-71c5a2aa24c8), Document(id=e49ffc42-18bc-474d-b361-cefe3c89f288, text= Originally formed as a quartet of siblings, the band currently consists of Sydney, Noah and Graham Sierota, following the departure of eldest sibling Jamie in late 2016., meta_data={'title': 'Echosmith'}, estimated_num_tokens=37, parent_doc_id=5df4835f-8312-4c26-abac-71c5a2aa24c8), Document(id=80dc23f0-b219-4f40-8ad1-4fb8ef1aa9cc, text= Echosmith started first as \"Ready Set Go!\" until they signed to Warner Bros., meta_data={'title': 'Echosmith'}, estimated_num_tokens=20, parent_doc_id=5df4835f-8312-4c26-abac-71c5a2aa24c8), Document(id=f910e6e9-2d31-42fb-85ac-793f05596351, text= Records in May 2012., meta_data={'title': 'Echosmith'}, estimated_num_tokens=8, parent_doc_id=5df4835f-8312-4c26-abac-71c5a2aa24c8), Document(id=aeac71e5-0385-4d1f-9a8b-2d028a72c45a, text= They are best known for their hit song \"Cool Kids\", which reached number 13 on the \"Billboard\" Hot 100 and was certified double platinum by the RIAA with over 1,200,000 sales in the United States and also double platinum by ARIA in Australia., meta_data={'title': 'Echosmith'}, estimated_num_tokens=60, parent_doc_id=5df4835f-8312-4c26-abac-71c5a2aa24c8), Document(id=b84d6161-4c5c-46e1-8821-141c6901ee41, text= The song was Warner Bros., meta_data={'title': 'Echosmith'}, estimated_num_tokens=7, parent_doc_id=5df4835f-8312-4c26-abac-71c5a2aa24c8), Document(id=a3b12aaa-b5f9-482c-8d96-6e1cf3bc1022, text= Records' 
fifth-biggest-selling-digital song of 2014, with 1., meta_data={'title': 'Echosmith'}, estimated_num_tokens=19, parent_doc_id=5df4835f-8312-4c26-abac-71c5a2aa24c8), Document(id=0c854314-7baf-4e65-b8d3-5c1f6b5a0eb6, text=3 million downloads sold., meta_data={'title': 'Echosmith'}, estimated_num_tokens=5, parent_doc_id=5df4835f-8312-4c26-abac-71c5a2aa24c8), Document(id=4ab4405f-9620-4e55-a62c-67afa651f866, text= The band's debut album, \"Talking Dreams\", was released on October 8, 2013., meta_data={'title': 'Echosmith'}, estimated_num_tokens=22, parent_doc_id=5df4835f-8312-4c26-abac-71c5a2aa24c8), Document(id=31e4d4aa-c52b-4815-b7b9-dc428adaa172, text=Women's colleges in the Southern United States refers to undergraduate, bachelor's degree–granting institutions, often liberal arts colleges, whose student populations consist exclusively or almost exclusively of women, located in the Southern United States., meta_data={'title': \"Women's colleges in the Southern United States\"}, estimated_num_tokens=43, parent_doc_id=51700d62-4ef3-4473-a23d-ee44ea36c384), Document(id=7f4f34df-f4cf-42a4-bd6d-8f6023fab1c5, text= Many started first as girls' seminaries or academies., meta_data={'title': \"Women's colleges in the Southern United States\"}, estimated_num_tokens=14, parent_doc_id=51700d62-4ef3-4473-a23d-ee44ea36c384), Document(id=9ae0bed3-570d-4e10-a95f-259476f677fb, text= Salem College is the oldest female educational institution in the South and Wesleyan College is the first that was established specifically as a college for women., meta_data={'title': \"Women's colleges in the Southern United States\"}, estimated_num_tokens=29, parent_doc_id=51700d62-4ef3-4473-a23d-ee44ea36c384), Document(id=b2cca951-04c8-4601-a873-79b6744939b8, text= Some schools, such as Mary Baldwin University and Salem College, offer coeducational courses at the graduate level., meta_data={'title': \"Women's colleges in the Southern United States\"}, estimated_num_tokens=23, 
parent_doc_id=51700d62-4ef3-4473-a23d-ee44ea36c384), Document(id=faa34193-9973-45f5-a7d5-788da8aa4d60, text=The First Arthur County Courthouse and Jail, was perhaps the smallest court house in the United States, and serves now as a museum., meta_data={'title': 'First Arthur County Courthouse and Jail'}, estimated_num_tokens=27, parent_doc_id=44543f35-9b5b-41dc-a4e9-6cadc4ecc9d5), Document(id=b1678550-4a22-44a0-8487-ab9e626189aa, text=Arthur's Magazine (1844–1846) was an American literary periodical published in Philadelphia in the 19th century., meta_data={'title': \"Arthur's Magazine\"}, estimated_num_tokens=26, parent_doc_id=6db097de-1c4f-410f-8777-cdca18fa67e5), Document(id=7ef7d4fe-5510-46ed-bd65-afa15ae5fc77, text= Edited by T., meta_data={'title': \"Arthur's Magazine\"}, estimated_num_tokens=5, parent_doc_id=6db097de-1c4f-410f-8777-cdca18fa67e5), Document(id=ee831115-8460-44b3-ac8b-7e681dd3e6a1, text=S., meta_data={'title': \"Arthur's Magazine\"}, estimated_num_tokens=2, parent_doc_id=6db097de-1c4f-410f-8777-cdca18fa67e5), Document(id=a06d4866-6c5a-4264-bf00-5a4d2fef0e2c, text= Arthur, it featured work by Edgar A., meta_data={'title': \"Arthur's Magazine\"}, estimated_num_tokens=9, parent_doc_id=6db097de-1c4f-410f-8777-cdca18fa67e5), Document(id=9c672c7b-0f63-4c4c-a2a7-7af6b588c08f, text= Poe, J., meta_data={'title': \"Arthur's Magazine\"}, estimated_num_tokens=4, parent_doc_id=6db097de-1c4f-410f-8777-cdca18fa67e5), Document(id=15677695-c18a-46a2-a99e-f8a73db1bb99, text=H., meta_data={'title': \"Arthur's Magazine\"}, estimated_num_tokens=2, parent_doc_id=6db097de-1c4f-410f-8777-cdca18fa67e5), Document(id=7b9854c3-4174-4e11-babd-0f0c3510c84b, text= Ingraham, Sarah Josepha Hale, Thomas G., meta_data={'title': \"Arthur's Magazine\"}, estimated_num_tokens=13, parent_doc_id=6db097de-1c4f-410f-8777-cdca18fa67e5), Document(id=87e1f254-46ac-48f7-b073-b0bdeca754ec, text= Spear, and others., meta_data={'title': \"Arthur's Magazine\"}, estimated_num_tokens=5, 
parent_doc_id=6db097de-1c4f-410f-8777-cdca18fa67e5), Document(id=9795985e-b5f6-4aa7-8455-965a3dc11d6c, text= In May 1846 it was merged into \"Godey's Lady's Book\"., meta_data={'title': \"Arthur's Magazine\"}, estimated_num_tokens=19, parent_doc_id=6db097de-1c4f-410f-8777-cdca18fa67e5), Document(id=0fcfd512-0155-4eec-bf2c-de1b0a6a654e, text=The 2014–15 Ukrainian Hockey Championship was the 23rd season of the Ukrainian Hockey Championship., meta_data={'title': '2014–15 Ukrainian Hockey Championship'}, estimated_num_tokens=21, parent_doc_id=c5ea8410-0200-4a87-8e5a-fe4b29cebdfe), Document(id=d9253bf0-fd55-4703-980e-6af4d21d29f8, text= Only four teams participated in the league this season, because of the instability in Ukraine and that most of the clubs had economical issues., meta_data={'title': '2014–15 Ukrainian Hockey Championship'}, estimated_num_tokens=27, parent_doc_id=c5ea8410-0200-4a87-8e5a-fe4b29cebdfe), Document(id=5df4bbcc-9258-4239-ab5c-9f04b5d62d9a, text= Generals Kiev was the only team that participated in the league the previous season, and the season started first after the year-end of 2014., meta_data={'title': '2014–15 Ukrainian Hockey Championship'}, estimated_num_tokens=31, parent_doc_id=c5ea8410-0200-4a87-8e5a-fe4b29cebdfe), Document(id=b0f067b0-3813-41a1-8241-45a628cf1ecf, text= The regular season included just 12 rounds, where all the teams went to the semifinals., meta_data={'title': '2014–15 Ukrainian Hockey Championship'}, estimated_num_tokens=20, parent_doc_id=c5ea8410-0200-4a87-8e5a-fe4b29cebdfe), Document(id=23bfb91f-3bf6-4d8a-91c6-cbfbb82b9aee, text= In the final, ATEK Kiev defeated the regular season winner HK Kremenchuk., meta_data={'title': '2014–15 Ukrainian Hockey Championship'}, estimated_num_tokens=20, parent_doc_id=c5ea8410-0200-4a87-8e5a-fe4b29cebdfe), Document(id=7a039ae2-cf6a-49b9-ada2-ebc496a2f8cf, text=First for Women is a woman's magazine published by Bauer Media Group in the USA., meta_data={'title': 'First for Women'}, 
estimated_num_tokens=17, parent_doc_id=35fc4eb7-7708-4e40-b045-4c73acb2204a), Document(id=52a20932-9738-4dc1-a9fb-895fba700879, text= The magazine was started in 1989., meta_data={'title': 'First for Women'}, estimated_num_tokens=10, parent_doc_id=35fc4eb7-7708-4e40-b045-4c73acb2204a), Document(id=f062e075-512d-49dd-abe0-bafbebc9dc0c, text= It is based in Englewood Cliffs, New Jersey., meta_data={'title': 'First for Women'}, estimated_num_tokens=14, parent_doc_id=35fc4eb7-7708-4e40-b045-4c73acb2204a), Document(id=213de99b-f082-4d32-a553-27611b97c78a, text= In 2011 the circulation of the magazine was 1,310,696 copies., meta_data={'title': 'First for Women'}, estimated_num_tokens=19, parent_doc_id=35fc4eb7-7708-4e40-b045-4c73acb2204a), Document(id=708ae365-5f5d-400c-827e-7b48a12a938d, text=The Freeway Complex Fire was a 2008 wildfire in the Santa Ana Canyon area of Orange County, California., meta_data={'title': 'Freeway Complex Fire'}, estimated_num_tokens=23, parent_doc_id=1197b099-643e-45a2-89a3-4ecd0708a648), Document(id=c9513000-fca5-48cf-9f69-e7931d3fed6a, text= The fire started as two separate fires on November 15, 2008., meta_data={'title': 'Freeway Complex Fire'}, estimated_num_tokens=17, parent_doc_id=1197b099-643e-45a2-89a3-4ecd0708a648), Document(id=bcf96704-624a-48f2-875c-8000ff2c55b4, text= The \"Freeway Fire\" started first shortly after 9am with the \"Landfill Fire\" igniting approximately 2 hours later., meta_data={'title': 'Freeway Complex Fire'}, estimated_num_tokens=29, parent_doc_id=1197b099-643e-45a2-89a3-4ecd0708a648), Document(id=12980697-3246-4bbc-a076-10f7187c8bea, text= These two separate fires merged a day later and ultimately destroyed 314 residences in Anaheim Hills and Yorba Linda., meta_data={'title': 'Freeway Complex Fire'}, estimated_num_tokens=24, parent_doc_id=1197b099-643e-45a2-89a3-4ecd0708a648), Document(id=f55ea350-85e5-4e1c-95a2-1f58baebfc8d, text=William Rast is an American clothing line founded by Justin Timberlake and Trace 
Ayala., meta_data={'title': 'William Rast'}, estimated_num_tokens=18, parent_doc_id=c16062a0-b92d-40ce-b8b1-7f98c5499aad), Document(id=456503d3-b061-4fdf-96ba-eb967f490f89, text= It is most known for their premium jeans., meta_data={'title': 'William Rast'}, estimated_num_tokens=10, parent_doc_id=c16062a0-b92d-40ce-b8b1-7f98c5499aad), Document(id=7bb89815-0734-4977-8caf-7e1076e351ae, text= On October 17, 2006, Justin Timberlake and Trace Ayala put on their first fashion show to launch their new William Rast clothing line., meta_data={'title': 'William Rast'}, estimated_num_tokens=33, parent_doc_id=c16062a0-b92d-40ce-b8b1-7f98c5499aad), Document(id=70e9204e-910d-496e-a200-dbba6764a345, text= The label also produces other clothing items such as jackets and tops., meta_data={'title': 'William Rast'}, estimated_num_tokens=14, parent_doc_id=c16062a0-b92d-40ce-b8b1-7f98c5499aad), Document(id=b8c0f1df-c383-4ad7-bf7e-b9eef7c38bb7, text= The company started first as a denim line, later evolving into a men’s and women’s clothing line., meta_data={'title': 'William Rast'}, estimated_num_tokens=22, parent_doc_id=c16062a0-b92d-40ce-b8b1-7f98c5499aad)]\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 1/1 [00:00<00:00, 1.84it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "data_key: Sequential_transformed\n" - ] - }, - { - "ename": "", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n", - "\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n", - "\u001b[1;31mClick here for more info. \n", - "\u001b[1;31mView Jupyter log for further details." - ] - } - ], + "outputs": [], "source": [ "# To get the ground truth context string from the supporting_facts filed in HotpotQA. 
This function is specific to the HotpotQA dataset.\n", "def get_supporting_sentences(\n", @@ -286,12 +265,12 @@ " return extracted_sentences\n", "\n", "\n", - "all_questions = []\n", - "all_retrieved_context = []\n", - "all_gt_context = []\n", - "all_pred_answer = []\n", - "all_gt_answer = []\n", - "for data in dataset:\n", + "questions = []\n", + "retrieved_contexts = []\n", + "gt_contexts = []\n", + "pred_answers = []\n", + "gt_answers = []\n", + "for data in selected_dataset:\n", " # build the document list\n", " num_docs = len(data[\"context\"][\"title\"])\n", " doc_list = [\n", @@ -301,26 +280,121 @@ " )\n", " for i in range(num_docs)\n", " ]\n", - " # rag = RAG(settings)\n", - " # # build the index\n", - " # rag.build_index(doc_list)\n", - " # # call the pipeline\n", - " # query = data[\"question\"]\n", - " # response, context_str = rag.call(query)\n", - " # import ipdb; ipdb.set_trace()\n", - " # gt_context_sentence_list = get_supporting_sentences(\n", - " # data[\"supporting_facts\"], data[\"context\"]\n", - " # )\n", - " # all_questions.append(query)\n", - " # all_retrieved_context.append(context_str)\n", - " # all_gt_context.append(gt_context_sentence_list)\n", - " # all_pred_answer.append(response[\"answer\"])\n", - " # all_gt_answer.append(data[\"answer\"])\n", - " # print(f\"query: {query}\")\n", - " # print(f\"response: {response['answer']}\")\n", - " # print(f\"ground truth response: {data['answer']}\")\n", - " # print(f\"context_str: {context_str}\")\n", - " # print(f\"ground truth context_str: {gt_context_sentence_list}\")\n" + " rag = RAG(settings)\n", + " # build the index\n", + " rag.build_index(doc_list)\n", + " # call the pipeline\n", + " query = data[\"question\"]\n", + " response, context_str = rag.call(query)\n", + " gt_context_sentence_list = get_supporting_sentences(\n", + " data[\"supporting_facts\"], data[\"context\"]\n", + " )\n", + " questions.append(query)\n", + " retrieved_contexts.append(context_str)\n", + " 
gt_contexts.append(gt_context_sentence_list)\n", + " pred_answers.append(response[\"answer\"])\n", + " gt_answers.append(data[\"answer\"])\n", + " print(f\"query: {query}\")\n", + " print(f\"response: {response['answer']}\")\n", + " print(f\"ground truth response: {data['answer']}\")\n", + " print(f\"context_str: {context_str}\")\n", + " print(f\"ground truth context_str: {gt_context_sentence_list}\")\n", + " break\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Evaluate the performance of the RAG pipeline**. We first evaluate the performance of the retriever component by calculating the *recall* of the retrieved context and the *relevance* score of the retrieved context." + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'retrieved_contexts' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[34], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# Compute the recall.\u001b[39;00m\n\u001b[1;32m 2\u001b[0m retriever_recall \u001b[38;5;241m=\u001b[39m RetrieverRecall()\n\u001b[0;32m----> 3\u001b[0m avg_recall, recall_list \u001b[38;5;241m=\u001b[39m retriever_recall\u001b[38;5;241m.\u001b[39mcompute(\u001b[43mretrieved_contexts\u001b[49m, gt_contexts)\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124maverage recall: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mavg_recall\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrecall list: 
\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mrecall_list\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n", + "\u001b[0;31mNameError\u001b[0m: name 'retrieved_contexts' is not defined" + ] + } + ], + "source": [ + "# Compute the recall.\n", + "retriever_recall = RetrieverRecall()\n", + "avg_recall, recall_list = retriever_recall.compute(retrieved_contexts, gt_contexts)\n", + "print(f\"average recall: {avg_recall}\")\n", + "print(f\"recall list: {recall_list}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Compute the relevance.\n", + "retriever_relevance = RetrieverRelevance()\n", + "avg_relevance, relevance_list = retriever_relevance.compute(\n", + " retrieved_contexts, gt_contexts\n", + ")\n", + "print(f\"average relevance: {avg_relevance}\")\n", + "print(f\"relevance list: {relevance_list}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we evaluate the generated answers using the AnswerMatchAcc metric, which compares the predicted answer with the ground truth answer." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Compute the answer match accuracy.\n", + "answer_match_acc = AnswerMatchAcc(type=\"exact_match\")\n", + "avg_acc, acc_list = answer_match_acc.compute(pred_answers, gt_answers)\n", + "print(f\"average accuracy: {avg_acc}\")\n", + "print(f\"accuracy list: {acc_list}\")\n", + "answer_match_acc = AnswerMatchAcc(type=\"fuzzy_match\")\n", + "avg_acc, acc_list = answer_match_acc.compute(pred_answers, gt_answers)\n", + "print(f\"average accuracy: {avg_acc}\")\n", + "print(f\"accuracy list: {acc_list}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We finally use an LLM as the judge for evaluating the performance. The task description in the `DEFAULT_LLM_EVALUATOR_PROMPT` is \"You are a helpful assistant. 
Given the question, ground truth answer, and predicted answer, you need to answer the judgement query. Output True or False according to the judgement query.\" You can customize the task description as needed. See the `lightrag.eval.LLMasJudge` class for more details." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "llm_judge = LLMasJudge()\n", + "judgement_query = (\n", + " \"For the question, does the predicted answer contain the ground truth answer?\"\n", + " )\n", + "avg_judgement, judgement_list = llm_judge.compute(\n", + " questions, gt_answers, pred_answers, judgement_query\n", + ")\n", + "print(f\"average judgement: {avg_judgement}\")\n", + "print(f\"judgement list: {judgement_list}\")" ] } ], From d81326c7d3357be5e949539bba1ed6f3545920de Mon Sep 17 00:00:00 2001 From: mengliu1998 <604629@gmail.com> Date: Sun, 16 Jun 2024 22:35:52 -0700 Subject: [PATCH 06/20] fixed a setup --- .../simple_rag.ipynb | 31 ++++++++++++------- 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/use_cases/retrieval_augmented_generation/simple_rag.ipynb b/use_cases/retrieval_augmented_generation/simple_rag.ipynb index d5676a81..19a70726 100644 --- a/use_cases/retrieval_augmented_generation/simple_rag.ipynb +++ b/use_cases/retrieval_augmented_generation/simple_rag.ipynb @@ -16,7 +16,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -49,7 +49,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -58,7 +58,7 @@ "False" ] }, - "execution_count": 30, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -66,8 +66,8 @@ "source": [ "# Here, we use the OpenAIClient in the Generator as an example, but you can use any other clients (with the corresponding API Key as needed)\n", "from lightrag.components.model_client import OpenAIClient\n", - "# 
OPENAI_API_KEY=\"YOUR_API_KEY\" # Replace with your OpenAI API Key, or you can put it in a .env file\n", - "OPENAI_API_KEY=\"sk-REDACTED-LEAKED-KEY-REVOKE-IMMEDIATELY\" # Example API Key\n", + "import os\n", + "os.environ[\"KMP_DUPLICATE_LIB_OK\"] = \"True\"\n", "import dotenv\n", "# load evironment\n", "dotenv.load_dotenv(dotenv_path=\".env\", override=True)" ] }, @@ -82,7 +82,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -109,7 +109,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "example: {'id': '5a7a06935542990198eaf050', 'question': \"Which magazine was started first Arthur's Magazine or First for Women?\", 'answer': \"Arthur's Magazine\", 'type': 'comparison', 'level': 'medium', 'supporting_facts': {'title': [\"Arthur's Magazine\", 'First for Women'], 'sent_id': [0, 0]}, 'context': {'title': ['Radio City (Indian radio station)', 'History of Albanian football', 'Echosmith', \"Women's colleges in the Southern United States\", 'First Arthur County Courthouse and Jail', \"Arthur's Magazine\", '2014–15 Ukrainian Hockey Championship', 'First for Women', 'Freeway Complex Fire', 'William Rast'], 'sentences': [[\"Radio City is India's first private FM radio station and was started on 3 July 2001.\", ' It broadcasts on 91.1 (earlier 91.0 in most cities) megahertz from Mumbai (where it was started in 2004), Bengaluru (started first in 2001), Lucknow and New Delhi (since 2003).', ' It plays Hindi, English and regional songs.', ' It was launched in Hyderabad in March 2006, in Chennai on 7 July 2006 and in Visakhapatnam October 2007.', ' Radio City recently forayed into New Media in May 2008 with the launch of a music portal - PlanetRadiocity.com that offers music related news, videos, songs, and other music-related features.', ' The Radio station currently plays a mix of Hindi and Regional music.', ' Abraham Thomas is 
the CEO of the company.'], ['Football in Albania existed before the Albanian Football Federation (FSHF) was created.', \" This was evidenced by the team's registration at the Balkan Cup tournament during 1929-1931, which started in 1929 (although Albania eventually had pressure from the teams because of competition, competition started first and was strong enough in the duels) .\", ' Albanian National Team was founded on June 6, 1930, but Albania had to wait 16 years to play its first international match and then defeated Yugoslavia in 1946.', ' In 1932, Albania joined FIFA (during the 12–16 June convention ) And in 1954 she was one of the founding members of UEFA.'], ['Echosmith is an American, Corporate indie pop band formed in February 2009 in Chino, California.', ' Originally formed as a quartet of siblings, the band currently consists of Sydney, Noah and Graham Sierota, following the departure of eldest sibling Jamie in late 2016.', ' Echosmith started first as \"Ready Set Go!\"', ' until they signed to Warner Bros.', ' Records in May 2012.', ' They are best known for their hit song \"Cool Kids\", which reached number 13 on the \"Billboard\" Hot 100 and was certified double platinum by the RIAA with over 1,200,000 sales in the United States and also double platinum by ARIA in Australia.', ' The song was Warner Bros.', \" Records' fifth-biggest-selling-digital song of 2014, with 1.3 million downloads sold.\", ' The band\\'s debut album, \"Talking Dreams\", was released on October 8, 2013.'], [\"Women's colleges in the Southern United States refers to undergraduate, bachelor's degree–granting institutions, often liberal arts colleges, whose student populations consist exclusively or almost exclusively of women, located in the Southern United States.\", \" Many started first as girls' seminaries or academies.\", ' Salem College is the oldest female educational institution in the South and Wesleyan College is the first that was established specifically as a 
college for women.', ' Some schools, such as Mary Baldwin University and Salem College, offer coeducational courses at the graduate level.'], ['The First Arthur County Courthouse and Jail, was perhaps the smallest court house in the United States, and serves now as a museum.'], [\"Arthur's Magazine (1844–1846) was an American literary periodical published in Philadelphia in the 19th century.\", ' Edited by T.S. Arthur, it featured work by Edgar A. Poe, J.H. Ingraham, Sarah Josepha Hale, Thomas G. Spear, and others.', ' In May 1846 it was merged into \"Godey\\'s Lady\\'s Book\".'], ['The 2014–15 Ukrainian Hockey Championship was the 23rd season of the Ukrainian Hockey Championship.', ' Only four teams participated in the league this season, because of the instability in Ukraine and that most of the clubs had economical issues.', ' Generals Kiev was the only team that participated in the league the previous season, and the season started first after the year-end of 2014.', ' The regular season included just 12 rounds, where all the teams went to the semifinals.', ' In the final, ATEK Kiev defeated the regular season winner HK Kremenchuk.'], [\"First for Women is a woman's magazine published by Bauer Media Group in the USA.\", ' The magazine was started in 1989.', ' It is based in Englewood Cliffs, New Jersey.', ' In 2011 the circulation of the magazine was 1,310,696 copies.'], ['The Freeway Complex Fire was a 2008 wildfire in the Santa Ana Canyon area of Orange County, California.', ' The fire started as two separate fires on November 15, 2008.', ' The \"Freeway Fire\" started first shortly after 9am with the \"Landfill Fire\" igniting approximately 2 hours later.', ' These two separate fires merged a day later and ultimately destroyed 314 residences in Anaheim Hills and Yorba Linda.'], ['William Rast is an American clothing line founded by Justin Timberlake and Trace Ayala.', ' It is most known for their premium jeans.', ' On October 17, 2006, Justin Timberlake and 
Trace Ayala put on their first fashion show to launch their new William Rast clothing line.', ' The label also produces other clothing items such as jackets and tops.', ' The company started first as a denim line, later evolving into a men’s and women’s clothing line.']]}}\n", "ground truth context: {'title': [\"Arthur's Magazine\", 'First for Women'], 'sent_id': [0, 0]}\n" ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/mengliu/Library/Caches/pypoetry/virtualenvs/lightrag-project-OrKUABKc-py3.12/lib/python3.12/site-packages/datasets/table.py:1421: FutureWarning: promote has been superseded by promote_options='default'.\n", + " table = cls._concat_blocks(blocks, axis=0)\n" + ] } ], "source": [ @@ -138,7 +146,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -298,8 +306,7 @@ " print(f\"response: {response['answer']}\")\n", " print(f\"ground truth response: {data['answer']}\")\n", " print(f\"context_str: {context_str}\")\n", - " print(f\"ground truth context_str: {gt_context_sentence_list}\")\n", - " break\n" + " print(f\"ground truth context_str: {gt_context_sentence_list}\")\n" ] }, { @@ -311,7 +318,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -321,7 +328,7 @@ "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[34], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# Compute the recall.\u001b[39;00m\n\u001b[1;32m 2\u001b[0m retriever_recall \u001b[38;5;241m=\u001b[39m RetrieverRecall()\n\u001b[0;32m----> 3\u001b[0m avg_recall, recall_list \u001b[38;5;241m=\u001b[39m retriever_recall\u001b[38;5;241m.\u001b[39mcompute(\u001b[43mretrieved_contexts\u001b[49m, gt_contexts)\n\u001b[1;32m 4\u001b[0m 
\u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124maverage recall: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mavg_recall\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrecall list: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mrecall_list\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n", + "Cell \u001b[0;32mIn[7], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# Compute the recall.\u001b[39;00m\n\u001b[1;32m 2\u001b[0m retriever_recall \u001b[38;5;241m=\u001b[39m RetrieverRecall()\n\u001b[0;32m----> 3\u001b[0m avg_recall, recall_list \u001b[38;5;241m=\u001b[39m retriever_recall\u001b[38;5;241m.\u001b[39mcompute(\u001b[43mretrieved_contexts\u001b[49m, gt_contexts)\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124maverage recall: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mavg_recall\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrecall list: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mrecall_list\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n", "\u001b[0;31mNameError\u001b[0m: name 'retrieved_contexts' is not defined" ] } From 00d4e5b3026b6349f4f1f396e11a1235a49eaa9a Mon Sep 17 00:00:00 2001 From: Li Yin Date: Tue, 2 Jul 2024 11:51:30 -0700 Subject: [PATCH 07/20] minior fix --- docs/source/developer_notes/index.rst | 1 + docs/source/developer_notes/output_parsers.rst | 2 +- docs/source/index.rst | 1 - lightrag/pyproject.toml | 2 +- 4 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/developer_notes/index.rst b/docs/source/developer_notes/index.rst index 463b03a5..bd92915a 100644 --- 
a/docs/source/developer_notes/index.rst +++ b/docs/source/developer_notes/index.rst @@ -97,6 +97,7 @@ Code path: :ref:`lightrag.core`. For abstract classes: * - :doc:`model_client` - ``ModelClient`` is the protocol and base class for LightRAG to **integrate all models**, either APIs or local, LLMs or Embedding models or any others. * - :doc:`generator` + - The orchestrator for LLM prediction. It streamlines three components: `ModelClient`, `Prompt`, and `output_processors` and works with optimizer for prompt optimization. - The **center component** that orchestrates the model client(LLMs in particular), prompt, and output processors for format parsing or any post processing. * - :doc:`output_parsers` - The component that parses the output string to structured data. diff --git a/docs/source/developer_notes/output_parsers.rst b/docs/source/developer_notes/output_parsers.rst index dc1f8e4c..ebe25f6a 100644 --- a/docs/source/developer_notes/output_parsers.rst +++ b/docs/source/developer_notes/output_parsers.rst @@ -1,2 +1,2 @@ -OutputParser +Parser ============= diff --git a/docs/source/index.rst b/docs/source/index.rst index a883edb0..72033d89 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -41,7 +41,6 @@ It is light, modular, and robust. from lightrag.core import Component, Generator from lightrag.components.model_client import GroqAPIClient - from lightrag.utils import setup_env #noqa class SimpleQA(Component): diff --git a/lightrag/pyproject.toml b/lightrag/pyproject.toml index 2fc49ce9..4665bc46 100644 --- a/lightrag/pyproject.toml +++ b/lightrag/pyproject.toml @@ -1,7 +1,7 @@ [tool.poetry] name = "lightrag" -version = "0.0.0-alpha.7" +version = "0.0.0-alpha.8" description = "The 'PyTorch' library for LLM applications. RAG=Retriever-Agent-Generator." 
authors = ["Li Yin "] readme = "README.md" From 86f0843ab875e55141f4d91d028872b084c382f4 Mon Sep 17 00:00:00 2001 From: Li Yin Date: Tue, 2 Jul 2024 12:08:07 -0700 Subject: [PATCH 08/20] fixed messed up tutorial index --- docs/source/developer_notes/index.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/developer_notes/index.rst b/docs/source/developer_notes/index.rst index bd92915a..b53249c8 100644 --- a/docs/source/developer_notes/index.rst +++ b/docs/source/developer_notes/index.rst @@ -76,6 +76,7 @@ Code path: :ref:`lightrag.core `. RAG Essentials ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + RAG components ^^^^^^^^^^^^^^^^^^^ @@ -98,7 +99,6 @@ Code path: :ref:`lightrag.core`. For abstract classes: - ``ModelClient`` is the protocol and base class for LightRAG to **integrate all models**, either APIs or local, LLMs or Embedding models or any others. * - :doc:`generator` - The orchestrator for LLM prediction. It streamlines three components: `ModelClient`, `Prompt`, and `output_processors` and works with optimizer for prompt optimization. - - The **center component** that orchestrates the model client(LLMs in particular), prompt, and output processors for format parsing or any post processing. * - :doc:`output_parsers` - The component that parses the output string to structured data. * - :doc:`embedder` @@ -106,6 +106,7 @@ Code path: :ref:`lightrag.core`. For abstract classes: * - :doc:`retriever` - The base class for all retrievers who in particular retrieve relevant documents from a given database to add **context** to the generator. 
+ Data Pipeline and Storage ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ From 5fc75c6b5396b4d1f5efa8baada8f713fbb8d260 Mon Sep 17 00:00:00 2001 From: Li Yin Date: Tue, 2 Jul 2024 12:32:54 -0700 Subject: [PATCH 09/20] parser --- docs/source/developer_notes/output_parsers.rst | 13 +++++++++++++ lightrag/lightrag/core/functional.py | 4 ++-- lightrag/lightrag/core/string_parser.py | 6 +++--- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/docs/source/developer_notes/output_parsers.rst b/docs/source/developer_notes/output_parsers.rst index ebe25f6a..33b0505c 100644 --- a/docs/source/developer_notes/output_parsers.rst +++ b/docs/source/developer_notes/output_parsers.rst @@ -1,2 +1,15 @@ Parser ============= +In this note, we will explain LightRAG parser and output parsers. + +Context +---------------- + +Parser +---------------- +LLMs output text in string format. +Parser is a component used to parse that string into desired data structure per the use case. + +Converting string to structured data is similar to the step of deserialization in serialization-deserialization process. +We already have powerful ``DataClass`` to handle the serialization-deserialization for data class instance. +Parser builts on top of that diff --git a/lightrag/lightrag/core/functional.py b/lightrag/lightrag/core/functional.py index a71ac206..8b635e8b 100644 --- a/lightrag/lightrag/core/functional.py +++ b/lightrag/lightrag/core/functional.py @@ -936,6 +936,7 @@ def parse_yaml_str_to_obj(yaml_str: str) -> Dict[str, Any]: Parse a YAML string to a Python object. yaml_str: has to be a valid YAML string. """ + yaml_str = yaml_str.strip() try: import yaml @@ -950,8 +951,7 @@ def parse_yaml_str_to_obj(yaml_str: str) -> Dict[str, Any]: def parse_json_str_to_obj(json_str: str) -> Dict[str, Any]: - r""" - Parse a JSON string to a Python object. + r"""Parse a JSON string to a Python object. json_str: has to be a valid JSON string. Either {} or []. 
""" json_str = json_str.strip() diff --git a/lightrag/lightrag/core/string_parser.py b/lightrag/lightrag/core/string_parser.py index e08ecd69..44fa9387 100644 --- a/lightrag/lightrag/core/string_parser.py +++ b/lightrag/lightrag/core/string_parser.py @@ -42,7 +42,7 @@ def __call__(self, input: str) -> List[Any]: raise ValueError(f"Error: {e}") -JASON_PARSER_OUTPUT_TYPE = Dict[str, Any] +JSON_PARSER_OUTPUT_TYPE = Dict[str, object] class JsonParser(Component): @@ -62,7 +62,7 @@ def __init__(self, add_missing_right_brace: bool = True): super().__init__() self.add_missing_right_brace = add_missing_right_brace - def call(self, input: str) -> JASON_PARSER_OUTPUT_TYPE: + def call(self, input: str) -> JSON_PARSER_OUTPUT_TYPE: input = input.strip() try: json_str = F.extract_json_str(input, self.add_missing_right_brace) @@ -73,7 +73,7 @@ def call(self, input: str) -> JASON_PARSER_OUTPUT_TYPE: raise ValueError(f"Error: {e}") -YAML_PARSER_OUTPUT_TYPE = Dict[str, Any] +YAML_PARSER_OUTPUT_TYPE = Dict[str, object] class YamlParser(Component): From 2761d6d5825089c27169cec8dc7913af8d1b8d62 Mon Sep 17 00:00:00 2001 From: Li Yin Date: Tue, 2 Jul 2024 14:30:18 -0700 Subject: [PATCH 10/20] complete the scope of tutorials index and mid of parser --- developer_notes/parser_note.py | 58 +++++++++++++++++++ docs/source/developer_notes/datasets.rst | 3 + docs/source/developer_notes/index.rst | 46 ++++++++++++--- docs/source/developer_notes/optimizer.rst | 1 + .../source/developer_notes/output_parsers.rst | 32 +++++++++- docs/source/developer_notes/parameter.rst | 3 +- docs/source/developer_notes/trainer.rst | 3 + .../components/output_parsers/outputs.py | 17 +----- lightrag/lightrag/core/functional.py | 45 +++++++++++--- lightrag/lightrag/core/string_parser.py | 26 ++++++--- 10 files changed, 193 insertions(+), 41 deletions(-) create mode 100644 developer_notes/parser_note.py create mode 100644 docs/source/developer_notes/datasets.rst create mode 100644 
docs/source/developer_notes/trainer.rst diff --git a/developer_notes/parser_note.py b/developer_notes/parser_note.py new file mode 100644 index 00000000..ef129c65 --- /dev/null +++ b/developer_notes/parser_note.py @@ -0,0 +1,58 @@ +def examples_of_different_ways_to_parse_string(): + # string to int/float + print(int("42")) + print(float("42.0")) + + # via json loads + import json + + print(json.loads('{"key": "value"}')) + + # a more complicated case + print( + json.loads( + '{"name": "John", "age": 30, "attributes": {"height": 180, "weight": 70}}' + ) + ) + + # json load for list + print(json.loads('["key", "value"]')) + + # via yaml + import yaml + + print(yaml.safe_load("key: value")) + print( + yaml.safe_load("name: John\nage: 30\nattributes:\n height: 180\n weight: 70") + ) + print(yaml.safe_load("['key', 'value']")) + + # via ast for python literal + import ast + + print(ast.literal_eval("42")) + print(ast.literal_eval("{'key': 'value'}")) + print(ast.literal_eval("['key', 'value']")) + # complex case like dict + print( + ast.literal_eval( + "{'name': 'John', 'age': 30, 'attributes': {'height': 180, 'weight': 70}}" + ) + ) + + # via regex + + # via eval for any python expression + print(eval("42")) + print(eval("{'key': 'value'}")) + print(eval("['key', 'value']")) + # complex case like dict + print( + eval("{'name': 'John', 'age': 30, 'attributes': {'height': 180, 'weight': 70}}") + ) + + # + + +if __name__ == "__main__": + examples_of_different_ways_to_parse_string() diff --git a/docs/source/developer_notes/datasets.rst b/docs/source/developer_notes/datasets.rst new file mode 100644 index 00000000..365591a1 --- /dev/null +++ b/docs/source/developer_notes/datasets.rst @@ -0,0 +1,3 @@ +Datasets +================ +Coming soon! 
diff --git a/docs/source/developer_notes/index.rst b/docs/source/developer_notes/index.rst index b53249c8..02513780 100644 --- a/docs/source/developer_notes/index.rst +++ b/docs/source/developer_notes/index.rst @@ -45,6 +45,7 @@ We have a clear :doc:`lightrag_design_philosophy`, which results in this :doc:`c Building ------------------- + Base classes ~~~~~~~~~~~~~~~~~~~~~~ Code path: :ref:`lightrag.core `. @@ -96,11 +97,11 @@ Code path: :ref:`lightrag.core`. For abstract classes: * - :doc:`prompt` - Built on `jinja2`, it programmatically and flexibly formats prompts as input to the generator. * - :doc:`model_client` - - ``ModelClient`` is the protocol and base class for LightRAG to **integrate all models**, either APIs or local, LLMs or Embedding models or any others. + - The standard `protocol` to intergrate LLMs, Embedding models, ranking models, etc into respective `orchestrator` components, either via APIs or local to reach to `model agnostic`. * - :doc:`generator` - - The orchestrator for LLM prediction. It streamlines three components: `ModelClient`, `Prompt`, and `output_processors` and works with optimizer for prompt optimization. + - The `orchestrator` for LLM prediction. It streamlines three components: `ModelClient`, `Prompt`, and `output_processors` and works with optimizer for prompt optimization. * - :doc:`output_parsers` - - The component that parses the output string to structured data. + - The `interpreter` of the LLM output. The component that parses the output string to structured data. * - :doc:`embedder` - The component that orchestrates model client (Embedding models in particular) and output processors. * - :doc:`retriever` @@ -196,11 +197,24 @@ Agent in ``components.agent`` is LLM great with reasoning, planning, and using t Optimizing ------------------- -Datasets and Evaulation +Evaluating +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
list-table:: + :widths: 20 80 + :header-rows: 1 + + * - Part + - Description + * - :doc:`datasets` + - The datasets used in the evaluation. + * - :doc:`evaluation` + - The evaluation metrics and methods. .. toctree:: :maxdepth: 1 - :caption: Datasets and Evaulation + :caption: Evaluating + :hidden: datasets @@ -208,11 +222,27 @@ Datasets and Evaulation evaluation -Optimizer & Trainer +Training +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +**Tutorials in this section are coming soon!** + +.. list-table:: + :widths: 20 80 + :header-rows: 1 + + * - Part + - Description + * - :doc:`parameter` + - The parameter class to optimize. + * - :doc:`optimizer` + - The optimizer to optimize the parameters. + * - :doc:`trainer` + - The trainer to train the model. .. toctree:: :maxdepth: 1 - :caption: Optimizer & Trainer + :caption: Training + :hidden: parameter @@ -222,7 +252,7 @@ Optimizer & Trainer Logging & Tracing & Configurations ------------------------------------ -Code path: ``lightrag.utils``. +Code path: :ref:`lightrag.utils ` and :ref:`lightrag.tracing `. .. list-table:: :widths: 20 80 diff --git a/docs/source/developer_notes/optimizer.rst b/docs/source/developer_notes/optimizer.rst index d26f6ad0..abc4c5a7 100644 --- a/docs/source/developer_notes/optimizer.rst +++ b/docs/source/developer_notes/optimizer.rst @@ -1,2 +1,3 @@ Optimizer ========================================================== +Coming soon! diff --git a/docs/source/developer_notes/output_parsers.rst b/docs/source/developer_notes/output_parsers.rst index 33b0505c..32a1730c 100644 --- a/docs/source/developer_notes/output_parsers.rst +++ b/docs/source/developer_notes/output_parsers.rst @@ -1,15 +1,41 @@ Parser ============= + In this note, we will explain LightRAG parser and output parsers. Context ---------------- -Parser ----------------- LLMs output text in string format. -Parser is a component used to parse that string into desired data structure per the use case. 
+Parsing is the process of `extracting` and `converting` the string to desired data structure per the use case. + +This desired data structure can be: + +- simple data types like string, int, float, boolean, etc. +- complex data types like list, dict, or data class instance. +- Code like Python, SQL, html, etc. + +It honestly can be converted to any kind of formats that are required by the use case. +It is an important step for the LLM applications to interact with the external world. +Parsing is like the `interpreter` of the LLM output. + +Scope and Design +------------------ + +Right now, we aim to cover the simple and complext data types but the code. + Converting string to structured data is similar to the step of deserialization in serialization-deserialization process. We already have powerful ``DataClass`` to handle the serialization-deserialization for data class instance. + +Parser in Action +------------------ + Parser builts on top of that + + +Output Parsers in Action +-------------------------- + +Evaluate Format following +-------------------------- diff --git a/docs/source/developer_notes/parameter.rst b/docs/source/developer_notes/parameter.rst index 514e5db9..2b4b9290 100644 --- a/docs/source/developer_notes/parameter.rst +++ b/docs/source/developer_notes/parameter.rst @@ -1,2 +1,3 @@ Parameter -==================== \ No newline at end of file +==================== +Coming soon! diff --git a/docs/source/developer_notes/trainer.rst b/docs/source/developer_notes/trainer.rst new file mode 100644 index 00000000..30e90187 --- /dev/null +++ b/docs/source/developer_notes/trainer.rst @@ -0,0 +1,3 @@ +Trainer +================ +Coming soon! 
diff --git a/lightrag/lightrag/components/output_parsers/outputs.py b/lightrag/lightrag/components/output_parsers/outputs.py index f36af45c..337c8570 100644 --- a/lightrag/lightrag/components/output_parsers/outputs.py +++ b/lightrag/lightrag/components/output_parsers/outputs.py @@ -1,8 +1,4 @@ -"""The most commonly used output parsers for the Generator. - -Note: Even with OutputParser for output_format_str formatting and the response parsing, it is not 100% guaranteed -as user query can impact the output. Test your code well! -""" +"""The most commonly used output parsers for the Generator.""" from dataclasses import is_dataclass from typing import Dict, Any, Optional, List @@ -15,13 +11,6 @@ from lightrag.core.base_data_class import ExcludeType -# TODO: might be worth to parse a list of yaml or json objects. For instance, a list of jokes. -# setup: Why couldn't the bicycle stand up by itself? -# punchline: Because it was two-tired. -# -# setup: What do you call a fake noodle? -# punchline: An impasta. 
- __all__ = [ "OutputParser", "YamlOutputParser", @@ -156,7 +145,7 @@ def __init__( f"Provided example is not an instance of the data class: {data_class}" ) self._exclude_fields = exclude_fields - self.data_class_for_yaml = data_class + self.data_class_for_yaml: DataClass = data_class self.yaml_output_format_prompt = Prompt(template=YAML_OUTPUT_FORMAT) self.output_processors = YamlParser() self.examples = examples @@ -221,7 +210,7 @@ def __init__( ) self._exclude_fields = exclude_fields template = JSON_OUTPUT_FORMAT - self.data_class_for_json = data_class + self.data_class_for_json: DataClass = data_class self.json_output_format_prompt = Prompt(template=template) self.output_processors = JsonParser() self.examples = examples diff --git a/lightrag/lightrag/core/functional.py b/lightrag/lightrag/core/functional.py index 8b635e8b..b45e0ff0 100644 --- a/lightrag/lightrag/core/functional.py +++ b/lightrag/lightrag/core/functional.py @@ -801,11 +801,27 @@ def generate_readable_key_for_function(fn: Callable) -> str: return f"{module_name}.{function_name}" +######################################################################################## +# For Parser components +######################################################################################## + + def extract_json_str(text: str, add_missing_right_brace: bool = True) -> str: - """ - Extract JSON string from text. - NOTE: Only handles the first JSON object or array found in the text. And it expects at least one JSON object in the text. + """Extract JSON string from text. + + It will extract the first JSON object or array found in the text by searching for { or [. If right brace is not found, we add one to the end of the string. + + Args: + text (str): The text containing potential JSON data. + add_missing_right_brace (bool): Whether to add a missing right brace if it is missing. + + Returns: + str: The extracted JSON string. 
+ + Raises: + ValueError: If no JSON object or array is found or if the JSON extraction is incomplete + without the option to add a missing brace """ # NOTE: this regex parsing is taken from langchain.output_parsers.pydantic text = text.strip() @@ -846,8 +862,9 @@ def extract_json_str(text: str, add_missing_right_brace: bool = True) -> str: def extract_list_str(text: str, add_missing_right_bracket: bool = True) -> str: - """ - Extract the first complete list string from the provided text. If the list string is incomplete + """Extract the first complete list string from the provided text. + + If the list string is incomplete (missing the closing bracket), an option allows adding a closing bracket at the end. Args: @@ -894,7 +911,18 @@ def extract_list_str(text: str, add_missing_right_bracket: bool = True) -> str: def extract_yaml_str(text: str) -> str: r"""Extract YAML string from text. - In default, we use regex pattern to match yaml code blocks within triple backticks with optional yaml or yml prefix. + .. note:: + As yaml string does not have a format like JSON which we can extract from {} or [], + it is crucial to have a format such as ```yaml``` or ```yml``` to indicate the start of the yaml string. + + Args: + text (str): The text containing potential YAML data. + + Returns: + str: The extracted YAML string. + + Raises: + ValueError: If no YAML string is found in the text. """ try: yaml_re_pattern: re.Pattern = re.compile( @@ -932,8 +960,8 @@ def fix_json_escaped_single_quotes(json_str: str) -> str: def parse_yaml_str_to_obj(yaml_str: str) -> Dict[str, Any]: - r""" - Parse a YAML string to a Python object. + r"""Parse a YAML string to a Python object. + yaml_str: has to be a valid YAML string. """ yaml_str = yaml_str.strip() @@ -952,6 +980,7 @@ def parse_yaml_str_to_obj(yaml_str: str) -> Dict[str, Any]: def parse_json_str_to_obj(json_str: str) -> Dict[str, Any]: r"""Parse a JSON string to a Python object. + json_str: has to be a valid JSON string. 
Either {} or []. """ json_str = json_str.strip() diff --git a/lightrag/lightrag/core/string_parser.py b/lightrag/lightrag/core/string_parser.py index 44fa9387..106d998f 100644 --- a/lightrag/lightrag/core/string_parser.py +++ b/lightrag/lightrag/core/string_parser.py @@ -1,4 +1,7 @@ -""" +"""Extract and convert JSON, YAML, and list strings to Python objects. + +It can be used as output_processor for generator. + LLM applications requires lots of string processing. Such as the text output needed to be parsed into: (1) JSON format or other formats (2) SQL/Python valid format @@ -7,7 +10,7 @@ We design this these string_parser modules to be generic to any input text without differentiating them as input text or output text. """ -from typing import Any, Dict, List +from typing import Dict, List, Union import logging from lightrag.core.component import Component @@ -15,9 +18,15 @@ log = logging.getLogger(__name__) +LIST_PARSER_OUTPUT_TYPE = List[object] + class ListParser(Component): - __doc__ = r"""To extract list strings from text and parse them into a list object. + __doc__ = r"""Extracts list `[...]` strings from text and parses them into a list object. + + Args: + add_missing_right_bracket (bool, optional): Add a missing right bracket to the list string. Defaults to True. + Examples: @@ -32,7 +41,7 @@ def __init__(self, add_missing_right_bracket: bool = True): super().__init__() self.add_missing_right_bracket = add_missing_right_bracket - def __call__(self, input: str) -> List[Any]: + def __call__(self, input: str) -> LIST_PARSER_OUTPUT_TYPE: input = input.strip() try: list_str = F.extract_list_str(input, self.add_missing_right_bracket) @@ -42,11 +51,14 @@ def __call__(self, input: str) -> List[Any]: raise ValueError(f"Error: {e}") -JSON_PARSER_OUTPUT_TYPE = Dict[str, object] +JSON_PARSER_OUTPUT_TYPE = Union[Dict[str, object], List[object]] class JsonParser(Component): - __doc__ = r"""To extract JSON strings from text and parse them into a JSON object. 
+ __doc__ = r"""Extracts JSON strings `{...}` or `[...]` from text and parses them into a JSON object. + + It can output either a dictionary or a list as they are both valid JSON objects. + Examples: @@ -73,7 +85,7 @@ def call(self, input: str) -> JSON_PARSER_OUTPUT_TYPE: raise ValueError(f"Error: {e}") -YAML_PARSER_OUTPUT_TYPE = Dict[str, object] +YAML_PARSER_OUTPUT_TYPE = JSON_PARSER_OUTPUT_TYPE class YamlParser(Component): From 38a234e7f6fcf94f07245e3dbe42c0bb248687df Mon Sep 17 00:00:00 2001 From: Li Yin Date: Tue, 2 Jul 2024 16:25:42 -0700 Subject: [PATCH 11/20] add basic parser, add test case --- developer_notes/parser_note.py | 140 ++++++++++++----- .../source/developer_notes/output_parsers.rst | 98 +++++++++++- lightrag/lightrag/core/functional.py | 77 ++++++++-- lightrag/lightrag/core/string_parser.py | 141 ++++++++++++++++-- lightrag/tests/test_string_parser.py | 137 ++++++++++++++++- 5 files changed, 527 insertions(+), 66 deletions(-) diff --git a/developer_notes/parser_note.py b/developer_notes/parser_note.py index ef129c65..973b0a9d 100644 --- a/developer_notes/parser_note.py +++ b/developer_notes/parser_note.py @@ -1,57 +1,123 @@ def examples_of_different_ways_to_parse_string(): - # string to int/float - print(int("42")) - print(float("42.0")) + + int_str = "42" + float_str = "42.0" + boolean_str = "True" # json works with true/false + None_str = "None" + Null_str = "null" # json works with null + dict_str = '{"key": "value"}' + list_str = '["key", "value"]' + nested_dict_str = ( + '{"name": "John", "age": 30, "attributes": {"height": 180, "weight": 70}}' + ) + yaml_dict_str = "key: value" + yaml_nested_dict_str = ( + "name: John\nage: 30\nattributes:\n height: 180\n weight: 70" + ) + yaml_list_str = "- key\n- value" + + # string to int/float/bool + print("built-in parser:\n____________________") + print(int(int_str)) + print(float(float_str)) + print(bool(boolean_str)) # via json loads import json - print(json.loads('{"key": "value"}')) + 
print("\njson parser:\n____________________") + json_int = json.loads(int_str) + json_float = json.loads(float_str) + json_bool = json.loads( + boolean_str.lower() + ) # json.loads only accepts true or false, not True or False + json_none = json.loads(Null_str) + json_dict = json.loads(dict_str) + json_list = json.loads(list_str) + json_nested_dict = json.loads(nested_dict_str) + # json_yaml_dict = json.loads(yaml_dict_str) # wont work + # json_yaml_nested_dict = json.loads(yaml_nested_dict_str) + # json_yaml_list = json.loads(yaml_list_str) - # a more complicated case - print( - json.loads( - '{"name": "John", "age": 30, "attributes": {"height": 180, "weight": 70}}' - ) - ) - - # json load for list - print(json.loads('["key", "value"]')) + print(int_str, type(json_int), json_int) + print(float_str, type(json_float), json_float) + print(boolean_str, type(json_bool), json_bool) + print(None_str, type(json_none), json_none) + print(dict_str, type(json_dict), json_dict) + print(list_str, type(json_list), json_list) + print(nested_dict_str, type(json_nested_dict), json_nested_dict) # via yaml import yaml - print(yaml.safe_load("key: value")) - print( - yaml.safe_load("name: John\nage: 30\nattributes:\n height: 180\n weight: 70") - ) - print(yaml.safe_load("['key', 'value']")) + print("\nyaml parser:\n____________________") + + yaml_int = yaml.safe_load(int_str) + yaml_float = yaml.safe_load(float_str) + yaml_bool = yaml.safe_load(boolean_str) + yaml_bool_lower = yaml.safe_load(boolean_str.lower()) + yaml_null = yaml.safe_load(Null_str) + yaml_none = yaml.safe_load(None_str) + + yaml_dict = yaml.safe_load(dict_str) + yaml_list = yaml.safe_load(list_str) + yaml_nested_dict = yaml.safe_load(nested_dict_str) + yaml_yaml_dict = yaml.safe_load(yaml_dict_str) + yaml_yaml_nested_dict = yaml.safe_load(yaml_nested_dict_str) + yaml_yaml_list = yaml.safe_load(yaml_list_str) + + print(int_str, type(yaml_int), yaml_int) + print(float_str, type(yaml_float), yaml_float) + 
print(boolean_str, type(yaml_bool), yaml_bool) + print(boolean_str.lower(), type(yaml_bool_lower), yaml_bool_lower) + print(Null_str, type(yaml_null), yaml_null) + print(None_str, type(yaml_none), yaml_none) + print(dict_str, type(yaml_dict), yaml_dict) + print(list_str, type(yaml_list), yaml_list) + print(nested_dict_str, type(yaml_nested_dict), yaml_nested_dict) + print(yaml_dict_str, type(yaml_yaml_dict), yaml_yaml_dict) + print(yaml_nested_dict_str, type(yaml_yaml_nested_dict), yaml_yaml_nested_dict) + print(yaml_list_str, type(yaml_yaml_list), yaml_yaml_list) # via ast for python literal import ast - print(ast.literal_eval("42")) - print(ast.literal_eval("{'key': 'value'}")) - print(ast.literal_eval("['key', 'value']")) - # complex case like dict - print( - ast.literal_eval( - "{'name': 'John', 'age': 30, 'attributes': {'height': 180, 'weight': 70}}" - ) - ) + print("\nast parser:\n____________________\n") - # via regex + ast_int = ast.literal_eval(int_str) + ast_float = ast.literal_eval(float_str) + ast_bool = ast.literal_eval(boolean_str) + ast_none = ast.literal_eval(None_str) + ast_dict = ast.literal_eval(dict_str) + ast_list = ast.literal_eval(list_str) + ast_nested_dict = ast.literal_eval(nested_dict_str) - # via eval for any python expression - print(eval("42")) - print(eval("{'key': 'value'}")) - print(eval("['key', 'value']")) - # complex case like dict - print( - eval("{'name': 'John', 'age': 30, 'attributes': {'height': 180, 'weight': 70}}") - ) + print(int_str, type(ast_int), ast_int) + print(float_str, type(ast_float), ast_float) + print(boolean_str, type(ast_bool), ast_bool) + print(None_str, type(ast_none), ast_none) + print(dict_str, type(ast_dict), ast_dict) + print(list_str, type(ast_list), ast_list) + print(nested_dict_str, type(ast_nested_dict), ast_nested_dict) + + # via eval for any python expression, but not recommended for security reasons + + print("\n eval parser:\n____________________\n") + + eval_int = eval(int_str) + eval_float = 
eval(float_str) + eval_bool = eval(boolean_str) + eval_dict = eval(dict_str) + eval_list = eval(list_str) + eval_nested = eval(nested_dict_str) + # eval_yaml_dict = eval(yaml_dict_str) # wont work - # + print(int_str, type(eval_int), eval_int) + print(float_str, type(eval_float), eval_float) + print(boolean_str, type(eval_bool), eval_bool) + print(dict_str, type(eval_dict), eval_dict) + print(list_str, type(eval_list), eval_list) + print(nested_dict_str, type(eval_nested), eval_nested) if __name__ == "__main__": diff --git a/docs/source/developer_notes/output_parsers.rst b/docs/source/developer_notes/output_parsers.rst index 32a1730c..0b192898 100644 --- a/docs/source/developer_notes/output_parsers.rst +++ b/docs/source/developer_notes/output_parsers.rst @@ -16,17 +16,92 @@ This desired data structure can be: - Code like Python, SQL, html, etc. It honestly can be converted to any kind of formats that are required by the use case. -It is an important step for the LLM applications to interact with the external world. -Parsing is like the `interpreter` of the LLM output. +It is an important step for the LLM applications to interact with the external world, such as: + +- to int to support classification and float to support regression. +- to list to support multiple choice selection. +- to json/yaml which will be extracted to dict, and optional further to data class instance to support support cases like function calls. + +Parsing is the `interpreter` of the LLM output. Scope and Design ------------------ -Right now, we aim to cover the simple and complext data types but the code. +*Right now, we aim to cover the simple and complext data types but the code.* + +**Parse** + +The following list the scope of our current support of parsing: + +.. 
code-block:: python + + int_str = "42" + float_str = "42.0" + boolean_str = "True" # json works with true/false, yaml works for both True/False and true/false + None_str = "None" + Null_str = "null" # json works with null, yaml works with both null and None + dict_str = '{"key": "value"}' + list_str = '["key", "value"]' + nested_dict_str = ( + '{"name": "John", "age": 30, "attributes": {"height": 180, "weight": 70}}' + ) + yaml_dict_str = "key: value" + yaml_nested_dict_str = ( + "name: John\nage: 30\nattributes:\n height: 180\n weight: 70" + ) + yaml_list_str = "- key\n- value" + +In Python, there are various ways to parse the string: +Use built-in functions like ``int``, ``float``, ``bool`` can handle the simple types. +We can use ``ast.literal_eval`` and ``json.loads()`` to handle the complex types like dict, list, and nested dict. +However, none of them is as robust as ``yaml.safe_load``. Yaml can: + +- Parse `True/False` and 'true/false' to boolean. +- Parse `None` and 'null' to None. +- Handle nested dict and list in both yaml and json format. + +Thus, we will use ``yaml.safe_load`` as the last resort for robust parsing to handle complex data types to get `List` and `Dict` data types. +We will use `int`, `float`, `bool` for simple data types. + +Parser +~~~~~~~~~~~~~~ + +Our parser is located at :doc:`core.string_parser`. +It handles both `extracting` and `parsing` to python object types. +And it is designed to be robust. + +.. list-table:: Parser Classes + :header-rows: 1 + :widths: 25 75 + + * - Parser Class + - Description + * - :class:`BooleanParser` + - Extracts the first boolean value from the text with ``bool``. Supports both `True/False` and 'true/false'. + * - :class:`IntParser` + - Extracts the first integer value from the text with ``int``. + * - :class:`FloatParser` + - Extracts the first float value from the text with ``float``. + * - :class:`ListParser` + - Extracts and parses the first list string from the text. 
Uses both `json.loads` and `yaml.safe_load`. + * - :class:`JsonParser` + - Extracts and parses JSON strings from the text. It resorts to `yaml.safe_load` for robust parsing. + * - :class:`YamlParser` + - Extracts and parses YAML strings from the text. + + + +**Data Class Instance** + +If your parsed object is dictionary, you can define and use ``DataClass`` instance. +With ``from_dict`` method, you can easily convert the dictionary to data class instance. + +.. Converting string to structured data is similar to the step of deserialization in serialization-deserialization process. +.. We already have powerful ``DataClass`` to handle the serialization-deserialization for data class instance. +Output Parsers +~~~~~~~~~~~~~~~~~~~~ -Converting string to structured data is similar to the step of deserialization in serialization-deserialization process. -We already have powerful ``DataClass`` to handle the serialization-deserialization for data class instance. Parser in Action ------------------ @@ -39,3 +114,16 @@ Output Parsers in Action Evaluate Format following -------------------------- + +.. admonition:: References + :class: highlight + + .. [1] Jinja2: https://jinja.palletsprojects.com/en/3.1.x/ + .. [2] Llama3 special tokens: https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3/ + + +.. 
admonition:: API References + :class: highlight + + - :ref:`string_parser` + - :ref:`OutputParser` diff --git a/lightrag/lightrag/core/functional.py b/lightrag/lightrag/core/functional.py index b45e0ff0..4c94401d 100644 --- a/lightrag/lightrag/core/functional.py +++ b/lightrag/lightrag/core/functional.py @@ -804,6 +804,58 @@ def generate_readable_key_for_function(fn: Callable) -> str: ######################################################################################## # For Parser components ######################################################################################## +def extract_first_int(text: str) -> int: + """Extract the first integer from the provided text. + + Args: + text (str): The text containing potential integer data. + + Returns: + int: The extracted integer. + + Raises: + ValueError: If no integer is found in the text. + """ + match = re.search(r"\b\d+\b", text) + if match: + return int(match.group()) + raise ValueError("No integer found in the text.") + + +def extract_first_float(text: str) -> float: + """Extract the first float from the provided text. + + Args: + text (str): The text containing potential float data. + + Returns: + float: The extracted float. + + Raises: + ValueError: If no float is found in the text. + """ + match = re.search(r"\b\d+\.\d+\b", text) + if match: + return float(match.group()) + raise ValueError("No float found in the text.") + + +def extract_first_boolean(text: str) -> bool: + """Extract the first boolean from the provided text. + + Args: + text (str): The text containing potential boolean data. + + Returns: + bool: The extracted boolean. + + Raises: + ValueError: If no boolean is found in the text. 
+ """ + match = re.search(r"\b(?:true|false|True|False)\b", text) + if match: + return match.group().lower() == "true" + raise ValueError("No boolean found in the text.") def extract_json_str(text: str, add_missing_right_brace: bool = True) -> str: @@ -932,7 +984,7 @@ def extract_yaml_str(text: str) -> str: yaml_str = "" if match: - yaml_str = match.group("yaml") + yaml_str = match.group("yaml").strip() else: yaml_str = text.strip() return yaml_str @@ -978,40 +1030,41 @@ def parse_yaml_str_to_obj(yaml_str: str) -> Dict[str, Any]: raise ImportError("Please pip install PyYAML.") from exc -def parse_json_str_to_obj(json_str: str) -> Dict[str, Any]: - r"""Parse a JSON string to a Python object. +def parse_json_str_to_obj(json_str: str) -> Union[Dict[str, Any], List[Any]]: + r"""Parse a varietry of json format string to Python object. json_str: has to be a valid JSON string. Either {} or []. """ json_str = json_str.strip() + # 1st attemp with json.loads try: json_obj = json.loads(json_str) return json_obj - except json.JSONDecodeError: + except json.JSONDecodeError as e: + log.info( + f"Got invalid JSON object with json.loads. Error: {e}. 
Got JSON string: {json_str}" + ) # 2nd attemp after fixing the json string try: - print("Trying to fix potential missing commas...") + log.info("Trying to fix potential missing commas...") json_str = fix_json_missing_commas(json_str) - print("Trying to fix scaped single quotes...") + log.info("Trying to fix scaped single quotes...") json_str = fix_json_escaped_single_quotes(json_str) - print(f"Fixed JSON string: {json_str}") + log.info(f"Fixed JSON string: {json_str}") json_obj = json.loads(json_str) return json_obj except json.JSONDecodeError: # 3rd attemp using yaml try: - import yaml # NOTE: parsing again with pyyaml # pyyaml is less strict, and allows for trailing commas # right now we rely on this since guidance program generates # trailing commas - print("Parsing JSON string with PyYAML...") + log.info("Parsing JSON string with PyYAML...") json_obj = yaml.safe_load(json_str) return json_obj except yaml.YAMLError as e: raise ValueError( - f"Got invalid JSON object. Error: {e}. Got JSON string: {json_str}" + f"Got invalid JSON object with yaml.safe_load. Error: {e}. Got JSON string: {json_str}" ) - except NameError as exc: - raise ImportError("Please pip install PyYAML.") from exc diff --git a/lightrag/lightrag/core/string_parser.py b/lightrag/lightrag/core/string_parser.py index 106d998f..349d0228 100644 --- a/lightrag/lightrag/core/string_parser.py +++ b/lightrag/lightrag/core/string_parser.py @@ -1,14 +1,6 @@ -"""Extract and convert JSON, YAML, and list strings to Python objects. +"""Extract and convert common string to Python objects. -It can be used as output_processor for generator. - -LLM applications requires lots of string processing. Such as the text output needed to be parsed into: -(1) JSON format or other formats -(2) SQL/Python valid format -(3) Tool(function) call format - -We design this these string_parser modules to be generic to any input text without differentiating them as input text or output text. 
-""" +From simple data types like boolean, integer, and float to more complex data types like JSON, YAML, and list strings.""" from typing import Dict, List, Union import logging @@ -18,6 +10,96 @@ log = logging.getLogger(__name__) +BOOLEAN_PARSER_OUTPUT_TYPE = bool + + +class BooleanParser(Component): + __doc__ = r"""Extracts boolean values from text. + + Examples: + + .. code-block:: python + + boolean_parser = BooleanParser() + test_input_1 = "True" # or "true" or "...true..." + print(boolean_parser(test_input_1)) # Expected to extract True + """ + + def __init__(self): + super().__init__() + + def __call__(self, input: str) -> BOOLEAN_PARSER_OUTPUT_TYPE: + input = input.strip() + try: + return F.extract_first_boolean(input) + except Exception as e: + raise ValueError(f"Error: {e}") + + +INT_PARSER_OUTPUT_TYPE = int + + +class IntParser(Component): + __doc__ = r"""Extracts integer values from text. + + Returns: + int: Extracted integer value. + + Raises: + ValueError: If the input text does not contain an integer + + Examples: + + .. code-block:: python + + int_parser = IntParser() + test_input_2 = "123" # or "...123..." + print(int_parser(test_input_2)) # Expected to extract 123 + """ + + def __init__(self): + super().__init__() + + def __call__(self, input: str) -> INT_PARSER_OUTPUT_TYPE: + input = input.strip() + try: + return F.extract_first_int(input) + except Exception as e: + raise ValueError(f"Error: {e}") + + +FLOAT_PARSER_OUTPUT_TYPE = float + + +class FloatParser(Component): + __doc__ = r"""Extracts float values from text. + + Returns: + float: Extracted float value. + + Raises: + ValueError: If the input text does not contain a float + + Examples: + + .. code-block:: python + + float_parser = FloatParser() + test_input_3 = "123.45" # or "...123.45..." 
+ print(float_parser(test_input_3)) # Expected to extract 123.45 + """ + + def __init__(self): + super().__init__() + + def __call__(self, input: str) -> FLOAT_PARSER_OUTPUT_TYPE: + input = input.strip() + try: + return F.extract_first_float(input) + except Exception as e: + raise ValueError(f"Error: {e}") + + LIST_PARSER_OUTPUT_TYPE = List[object] @@ -27,6 +109,11 @@ class ListParser(Component): Args: add_missing_right_bracket (bool, optional): Add a missing right bracket to the list string. Defaults to True. + Returns: + List[object]: Extracted list object. + + Raises: + ValueError: If the input text does not contain a list Examples: @@ -43,12 +130,20 @@ def __init__(self, add_missing_right_bracket: bool = True): def __call__(self, input: str) -> LIST_PARSER_OUTPUT_TYPE: input = input.strip() + list_str = None + # Extract list string try: list_str = F.extract_list_str(input, self.add_missing_right_bracket) + + except Exception as e: + raise ValueError(f"Error at extracting list string: {e}") + + # Parse list string with json.loads and yaml.safe_load + try: list_obj = F.parse_json_str_to_obj(list_str) return list_obj except Exception as e: - raise ValueError(f"Error: {e}") + log.error(f"Error at parsing list string with json.loads: {e}") JSON_PARSER_OUTPUT_TYPE = Union[Dict[str, object], List[object]] @@ -59,6 +154,15 @@ class JsonParser(Component): It can output either a dictionary or a list as they are both valid JSON objects. + Args: + add_missing_right_brace (bool, optional): Add a missing right brace to the JSON string. Defaults to True. + + Returns: + Union[Dict[str, object], List[object]]: Extracted JSON object. 
+ + Raises: + ValueError: If the input text does not contain a JSON object + Examples: @@ -76,12 +180,20 @@ def __init__(self, add_missing_right_brace: bool = True): def call(self, input: str) -> JSON_PARSER_OUTPUT_TYPE: input = input.strip() + # Extract JSON string + json_str = None try: json_str = F.extract_json_str(input, self.add_missing_right_brace) log.debug(f"json_str: {json_str}") + + except Exception as e: + raise ValueError(f"Error: {e}") + # Parse JSON string with json.loads and yaml.safe_load + try: json_obj = F.parse_json_str_to_obj(json_str) return json_obj except Exception as e: + log.error(f"Error at parsing JSON string: {e}") raise ValueError(f"Error: {e}") @@ -91,6 +203,12 @@ def call(self, input: str) -> JSON_PARSER_OUTPUT_TYPE: class YamlParser(Component): __doc__ = r"""To extract YAML strings from text and parse them into a YAML object. + Returns: + JSON_PARSER_OUTPUT_TYPE: Extracted YAML object. + + Raises: + ValueError: If the input text does not contain a YAML object + Examples: .. 
code-block:: python @@ -106,6 +224,7 @@ def __init__(self): def call(self, input: str) -> YAML_PARSER_OUTPUT_TYPE: input = input.strip() + # parse YAML string with yaml.safe_load try: yaml_str = F.extract_yaml_str(input) yaml_obj = F.parse_yaml_str_to_obj(yaml_str) diff --git a/lightrag/tests/test_string_parser.py b/lightrag/tests/test_string_parser.py index ab07fdf3..4fbe0abc 100644 --- a/lightrag/tests/test_string_parser.py +++ b/lightrag/tests/test_string_parser.py @@ -1,14 +1,22 @@ import pytest +import unittest from lightrag.core.string_parser import ( JsonParser, + IntParser, + FloatParser, + BooleanParser, ) from lightrag.core.functional import ( extract_json_str, fix_json_missing_commas, fix_json_escaped_single_quotes, + extract_yaml_str, ) +from lightrag.utils.logger import enable_library_logging + +enable_library_logging() ################################################## @@ -149,7 +157,8 @@ def test_json_parser_handling_decode_error(): # Deliberately malformed JSON that is also problematic for YAML text = '{"name": "John", "age": 30, "attributes": {"height": 180, "weight": 70}]}' with pytest.raises(ValueError) as excinfo: - parser(text) + output = parser(text) + print(f"output: {output}") assert "Got invalid JSON object" in str(excinfo.value) @@ -162,7 +171,133 @@ def test_json_parser_escape_single_quotes(): "action": "ask_for_information("company information and founder\'s profile")" } """ + # "ask_for_information(\"company information and founder's profile\")" with pytest.raises(ValueError) as excinfo: result = parser(text) print(f"result: {result}") assert "Got invalid JSON object" in str(excinfo.value) + + +class TestExtractYamlStr(unittest.TestCase): + + def test_extract_yaml_with_triple_backticks(self): + text = """```yaml +name: John +age: 30 +attributes: + height: 180 + weight: 70 +```""" + expected = """ +name: John +age: 30 +attributes: + height: 180 + weight: 70 +""" + result = extract_yaml_str(text) + print(f"triple backticks result: 
{result}") + self.assertEqual(result, expected.strip()) + + def test_extract_yaml_with_triple_backticks_no_yaml(self): + text = """``` +name: John +age: 30 +attributes: + height: 180 + weight: 70 +```""" + expected = """ +name: John +age: 30 +attributes: + height: 180 + weight: 70 +""" + result = extract_yaml_str(text) + self.assertEqual(result, expected.strip()) + + def test_extract_yaml_without_triple_backticks(self): + text = """ +name: John +age: 30 +attributes: + height: 180 + weight: 70 +""" + expected = """ +name: John +age: 30 +attributes: + height: 180 + weight: 70 +""" + result = extract_yaml_str(text) + self.assertEqual(result, expected.strip()) + + def test_no_yaml_string_found(self): + text = """Some random text without YAML format.""" + expected = "Some random text without YAML format." + result = extract_yaml_str(text) + self.assertEqual(result, expected.strip()) + + def test_incomplete_yaml_format(self): + text = """```yaml +name: John +age: 30 +attributes: + height: 180 + weight: 70""" + expected = """ +name: John +age: 30 +attributes: + height: 180 + weight: 70""" + result = extract_yaml_str(text) + self.assertEqual(result, expected.strip()) + + +class TestBooleanParser(unittest.TestCase): + def setUp(self): + self.parser = BooleanParser() + + def test_true(self): + self.assertTrue(self.parser("true")) + self.assertTrue(self.parser("True")) + self.assertTrue(self.parser("...true...")) + + def test_false(self): + self.assertFalse(self.parser("false")) + self.assertFalse(self.parser("False")) + self.assertFalse(self.parser("...false...")) + + def test_no_boolean(self): + with self.assertRaises(ValueError): + self.parser("no boolean here") + + +class TestIntParser(unittest.TestCase): + def setUp(self): + self.parser = IntParser() + + def test_integer(self): + self.assertEqual(self.parser("123"), 123) + self.assertEqual(self.parser("...123..."), 123) + + def test_no_integer(self): + with self.assertRaises(ValueError): + self.parser("no integer here") 
+ + +class TestFloatParser(unittest.TestCase): + def setUp(self): + self.parser = FloatParser() + + def test_float(self): + self.assertEqual(self.parser("123.45"), 123.45) + self.assertEqual(self.parser("...123.45..."), 123.45) + + def test_no_float(self): + with self.assertRaises(ValueError): + self.parser("no float here") From 6e27c5063b51afcd384d9fb4b3deb45a7d449961 Mon Sep 17 00:00:00 2001 From: Li Yin Date: Tue, 2 Jul 2024 18:35:44 -0700 Subject: [PATCH 12/20] added examples for output parsers --- developer_notes/parser_note.py | 113 ++++++++ .../source/developer_notes/output_parsers.rst | 245 +++++++++++++++++- lightrag/lightrag/core/functional.py | 7 +- lightrag/lightrag/core/string_parser.py | 1 + 4 files changed, 352 insertions(+), 14 deletions(-) diff --git a/developer_notes/parser_note.py b/developer_notes/parser_note.py index 973b0a9d..56b8d3bb 100644 --- a/developer_notes/parser_note.py +++ b/developer_notes/parser_note.py @@ -120,5 +120,118 @@ def examples_of_different_ways_to_parse_string(): print(nested_dict_str, type(eval_nested), eval_nested) +def int_parser(): + from lightrag.core.string_parser import IntParser + + int_str = "42" + int_str_2 = "42.0" + int_str_3 = "42.7" + int_str_4 = "the answer is 42.75" + + # it will all return 42 + parser = IntParser() + print(parser) + print(parser(int_str)) + print(parser(int_str_2)) + print(parser(int_str_3)) + print(parser(int_str_4)) + + +def float_parser(): + from lightrag.core.string_parser import FloatParser + + float_str = "42.0" + float_str_2 = "42" + float_str_3 = "42.7" + float_str_4 = "the answer is 42.75" + + # it will all return 42.0 + parser = FloatParser() + print(parser(float_str)) + print(parser(float_str_2)) + print(parser(float_str_3)) + print(parser(float_str_4)) + + +def bool_parser(): + from lightrag.core.string_parser import BooleanParser + + bool_str = "True" + bool_str_2 = "False" + bool_str_3 = "true" + bool_str_4 = "false" + # bool_str_5 = "1" # will fail + # bool_str_6 = "0" # 
will fail + # bool_str_7 = "yes" # will fail + # bool_str_8 = "no" # will fail + + # it will all return True/False + parser = BooleanParser() + print(parser(bool_str)) + print(parser(bool_str_2)) + print(parser(bool_str_3)) + print(parser(bool_str_4)) + # print(parser(bool_str_5)) + # print(parser(bool_str_6)) + # print(parser(bool_str_7)) + # print(parser(bool_str_8)) + + +def list_parser(): + + from lightrag.core.string_parser import ListParser + + list_str = '["key", "value"]' + list_str_2 = 'prefix["key", 2]...' + list_str_3 = '[{"key": "value"}, {"key": "value"}]' + # dict_str = '{"key": "value"}' + + parser = ListParser() + print(parser(list_str)) + print(parser(list_str_2)) + print(parser(list_str_3)) + # print(parser(dict_str)) # will raise ValueError + + +def json_parser(): + from lightrag.core.string_parser import JsonParser + + dict_str = '{"key": "value"}' + nested_dict_str = ( + '{"name": "John", "age": 30, "attributes": {"height": 180, "weight": 70}}' + ) + list_str = '["key", 2]' + list_dict_str = '[{"key": "value"}, {"key": "value"}]' + + parser = JsonParser() + print(parser) + print(parser(dict_str)) + print(parser(nested_dict_str)) + print(parser(list_str)) + print(parser(list_dict_str)) + + +def yaml_parser(): + from lightrag.core.string_parser import YamlParser + + yaml_dict_str = "key: value" + yaml_nested_dict_str = ( + "name: John\nage: 30\nattributes:\n height: 180\n weight: 70" + ) + yaml_list_str = "- key\n- value" + + parser = YamlParser() + print(parser) + print(parser(yaml_dict_str)) + print(parser(yaml_nested_dict_str)) + print(parser(yaml_list_str)) + + if __name__ == "__main__": examples_of_different_ways_to_parse_string() + int_parser() + float_parser() + bool_parser() + list_parser() + json_parser() + yaml_parser() diff --git a/docs/source/developer_notes/output_parsers.rst b/docs/source/developer_notes/output_parsers.rst index 0b192898..fa2bfa7d 100644 --- a/docs/source/developer_notes/output_parsers.rst +++ 
b/docs/source/developer_notes/output_parsers.rst @@ -70,24 +70,51 @@ Our parser is located at :doc:`core.string_parser`. It handles both `extracting` and `parsing` to python object types. And it is designed to be robust. -.. list-table:: Parser Classes +.. list-table:: :header-rows: 1 - :widths: 25 75 + :widths: 25 25 50 * - Parser Class + - Target Python Object - Description * - :class:`BooleanParser` - - Extracts the first boolean value from the text with ``bool``. Supports both `True/False` and 'true/false'. + - ``bool`` + - Extracts the first boolean value from the text with ``bool``. Supports both 'True/False' and 'true/false'. * - :class:`IntParser` + - ``int`` - Extracts the first integer value from the text with ``int``. * - :class:`FloatParser` + - ``float`` - Extracts the first float value from the text with ``float``. * - :class:`ListParser` - - Extracts and parses the first list string from the text. Uses both `json.loads` and `yaml.safe_load`. + - ``list`` + - Extracts '[]' and parses the first list string from the text. Uses both `json.loads` and `yaml.safe_load`. * - :class:`JsonParser` - - Extracts and parses JSON strings from the text. It resorts to `yaml.safe_load` for robust parsing. + - ``dict`` + - Extracts '[]' and '{}' and parses JSON strings from the text. It resorts to `yaml.safe_load` for robust parsing. * - :class:`YamlParser` - - Extracts and parses YAML strings from the text. + - ``dict`` + - Extracts '```yaml```', '```yml```' or the whole string and parses YAML strings from the text. + + +.. .. list-table:: Parser Classes +.. :header-rows: 1 +.. :widths: 25 75 + +.. * - Parser Class +.. - Description +.. * - :class:`BooleanParser` +.. - Extracts the first boolean value from the text with ``bool``. Supports both 'True/False' and 'true/false'. +.. * - :class:`IntParser` +.. - Extracts the first integer value from the text with ``int``. +.. * - :class:`FloatParser` +.. - Extracts the first float value from the text with ``float``. +.. 
* - :class:`ListParser` +.. - Extracts and parses the first list string from the text. Uses both `json.loads` and `yaml.safe_load`. Use this for ``list`` object type. +.. * - :class:`JsonParser` +.. - Extracts and parses JSON strings from the text. It resorts to `yaml.safe_load` for robust parsing. Use this for ``dict`` object type. +.. * - :class:`YamlParser` +.. - Extracts and parses YAML strings from the text. Use this for ``dict`` object type. @@ -101,25 +128,215 @@ With ``from_dict`` method, you can easily convert the dictionary to data class i Output Parsers ~~~~~~~~~~~~~~~~~~~~ +The above parsers do not come with output format instructions. +Thus, we created :class:`OutputParser` to orchestrate both the formatting and parsing process. +It is an abstract component with two main methods: + +- ``format_instructions``: to generate the output format instructions for the prompt. +- ``call``: to parse the output string to the desired python object. + +If you are targetting at ``dict`` object, we already have ``DataClass`` to help us describe any data class type and instance that can be easily used to interact with LLMs. +Thus, ``JsonOutputParser`` and ``YamlOutputParser`` both takes the following arguments: + +- ``data_class``: the ``DataClass`` type. +- ``examples``: the examples of the data class instance if you want to show the examples in the prompt. +- ``exclude``: the fields to exclude from both the data format and the examples. +.. TODO: a summary table Parser in Action ------------------ +All of the parsers are quite straightforward to use. + +**BooleanParser** + +.. 
code-block:: python + + from lightrag.core.string_parser import BooleanParser + + bool_str = "True" + bool_str_2 = "False" + bool_str_3 = "true" + bool_str_4 = "false" + bool_str_5 = "1" # will fail + bool_str_6 = "0" # will fail + bool_str_7 = "yes" # will fail + bool_str_8 = "no" # will fail + + # it will all return True/False + parser = BooleanParser() + print(parser(bool_str)) + print(parser(bool_str_2)) + print(parser(bool_str_3)) + print(parser(bool_str_4)) + +The printout will be: + +.. code-block:: + + True + False + True + False + +Boolean parsers will not work for '1', '0', 'yes', 'no' as they are not the standard boolean values. + +**IntParser** + +.. code-block:: python + + from lightrag.core.string_parser import IntParser + + int_str = "42" + int_str_2 = "42.0" + int_str_3 = "42.7" + int_str_4 = "the answer is 42.75" + + # it will all return 42 + parser = IntParser() + print(parser(int_str)) + print(parser(int_str_2)) + print(parser(int_str_3)) + print(parser(int_str_4)) + +The printout will be: + +.. code-block:: + + 42 + 42 + 42 + 42 + +``IntParser`` will return the integer value of the first number in the string, even if it is a float. + +**FloatParser** + +.. code-block:: python + + from lightrag.core.string_parser import FloatParser + + float_str = "42.0" + float_str_2 = "42" + float_str_3 = "42.7" + float_str_4 = "the answer is 42.75" + + # it will all return 42.0 + parser = FloatParser() + print(parser(float_str)) + print(parser(float_str_2)) + print(parser(float_str_3)) + print(parser(float_str_4)) + +The printout will be: + +.. code-block:: + + 42.0 + 42.0 + 42.7 + 42.75 + + +``FloatParser`` will return the float value of the first number in the string, even if it is an integer. + +**ListParser** + +.. code-block:: python + + from lightrag.core.string_parser import ListParser + + list_str = '["key", "value"]' + list_str_2 = 'prefix["key", 2]...'
+ list_str_3 = '[{"key": "value"}, {"key": "value"}]' + + parser = ListParser() + print(parser(list_str)) + print(parser(list_str_2)) + print(parser(list_str_3)) -Parser builts on top of that +The output will be: +.. code-block:: python + + ['key', 'value'] + ['key', 2] + [{'key': 'value'}, {'key': 'value'}] + +**JsonParser** + +Even though it can work on lists, it is better to only use it for dictionaries. + +.. code-block:: python + + from lightrag.core.string_parser import JsonParser + + dict_str = '{"key": "value"}' + nested_dict_str = ( + '{"name": "John", "age": 30, "attributes": {"height": 180, "weight": 70}}' + ) + list_str = '["key", 2]' + list_dict_str = '[{"key": "value"}, {"key": "value"}]' + + parser = JsonParser() + print(parser) + print(parser(dict_str)) + print(parser(nested_dict_str)) + print(parser(list_str)) + print(parser(list_dict_str)) + +The output will be: + +.. code-block:: python + + {'key': 'value'} + {'name': 'John', 'age': 30, 'attributes': {'height': 180, 'weight': 70}} + ['key', 2] + [{'key': 'value'}, {'key': 'value'}] + +**YamlParser** + +Though it works almost on all of the previous examples, it is better to use it for yaml formatted dictionaries. + +.. code-block:: python + + from lightrag.core.string_parser import YamlParser + + yaml_dict_str = "key: value" + yaml_nested_dict_str = ( + "name: John\nage: 30\nattributes:\n height: 180\n weight: 70" + ) + yaml_list_str = "- key\n- value" + + parser = YamlParser() + print(parser) + print(parser(yaml_dict_str)) + print(parser(yaml_nested_dict_str)) + print(parser(yaml_list_str)) + +The output will be: + +.. code-block:: python + + {'key': 'value'} + {'name': 'John', 'age': 30, 'attributes': {'height': 180, 'weight': 70}} + ['key', 'value'] + +.. note:: + All parsers will raise ``ValueError`` if it fails at any step. Developers should process it accordingly. Output Parsers in Action -------------------------- -Evaluate Format following --------------------------- +.. # todo +.. 
Evaluate Format following +.. -------------------------- -.. admonition:: References - :class: highlight +.. .. admonition:: References +.. :class: highlight - .. [1] Jinja2: https://jinja.palletsprojects.com/en/3.1.x/ - .. [2] Llama3 special tokens: https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3/ +.. .. [1] Jinja2: https://jinja.palletsprojects.com/en/3.1.x/ +.. .. [2] Llama3 special tokens: https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3/ .. admonition:: API References @@ -127,3 +344,5 @@ Evaluate Format following - :ref:`string_parser` - :ref:`OutputParser` + - :class:`components.output_parsers.outputs.JsonOutputParser` + - :class:`components.output_parsers.outputs.YamlOutputParser` diff --git a/lightrag/lightrag/core/functional.py b/lightrag/lightrag/core/functional.py index 4c94401d..83417816 100644 --- a/lightrag/lightrag/core/functional.py +++ b/lightrag/lightrag/core/functional.py @@ -834,7 +834,8 @@ def extract_first_float(text: str) -> float: Raises: ValueError: If no float is found in the text. """ - match = re.search(r"\b\d+\.\d+\b", text) + match = re.search(r"\b\d+(\.\d+)?\b", text) + if match: return float(match.group()) raise ValueError("No float found in the text.") @@ -933,6 +934,8 @@ def extract_list_str(text: str, add_missing_right_bracket: bool = True) -> str: text = text.strip() start = text.find("[") if start == -1: + log.error("No list found in the text.") + # return None raise ValueError("No list found in the text.") # Attempt to find the matching closing bracket @@ -953,6 +956,8 @@ def extract_list_str(text: str, add_missing_right_bracket: bool = True) -> str: text += "]" end = len(text) - 1 elif end == -1: + log.error("Incomplete list found and add_missing_right_bracket is False.") + # return None raise ValueError( "Incomplete list found and add_missing_right_bracket is False." 
) diff --git a/lightrag/lightrag/core/string_parser.py b/lightrag/lightrag/core/string_parser.py index 349d0228..fe0c480c 100644 --- a/lightrag/lightrag/core/string_parser.py +++ b/lightrag/lightrag/core/string_parser.py @@ -144,6 +144,7 @@ def __call__(self, input: str) -> LIST_PARSER_OUTPUT_TYPE: return list_obj except Exception as e: log.error(f"Error at parsing list string with json.loads: {e}") + raise ValueError(f"Error: {e}") JSON_PARSER_OUTPUT_TYPE = Union[Dict[str, object], List[object]] From 4c4df401a06844aa323e7221e0f8f6ea4c2bd7f6 Mon Sep 17 00:00:00 2001 From: Li Yin Date: Tue, 2 Jul 2024 19:37:04 -0700 Subject: [PATCH 13/20] update the unready documentation --- docs/source/developer_notes/output_parsers.rst | 1 - docs/source/index.rst | 14 +++++++------- docs/source/tutorials/index.rst | 11 +++++------ 3 files changed, 12 insertions(+), 14 deletions(-) diff --git a/docs/source/developer_notes/output_parsers.rst b/docs/source/developer_notes/output_parsers.rst index fa2bfa7d..72afcc1f 100644 --- a/docs/source/developer_notes/output_parsers.rst +++ b/docs/source/developer_notes/output_parsers.rst @@ -8,7 +8,6 @@ Context LLMs output text in string format. Parsing is the process of `extracting` and `converting` the string to desired data structure per the use case. - This desired data structure can be: - simple data types like string, int, float, boolean, etc. diff --git a/docs/source/index.rst b/docs/source/index.rst index 72033d89..8f80ffe4 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -157,6 +157,7 @@ Researchers will want their code to be adapted into more products too. .. :hidden: +.. Hide the use cases for now .. toctree:: :maxdepth: 1 :caption: Use Cases - How different parts are used to build various LLM applications @@ -184,11 +185,10 @@ Researchers will want their code to be adapted into more products too. .. resources/index +.. hide the for contributors now +.. :glob: +.. :maxdepth: 1 +.. :caption: For Contributors +.. 
:hidden: -.. toctree:: - :glob: - :maxdepth: 1 - :caption: For Contributors - :hidden: - - contributor/index +.. contributor/index diff --git a/docs/source/tutorials/index.rst b/docs/source/tutorials/index.rst index dc0edc48..c174f250 100644 --- a/docs/source/tutorials/index.rst +++ b/docs/source/tutorials/index.rst @@ -3,12 +3,11 @@ Use Cases ============================= -*How different parts are used to build various LLM applications* +How different parts are used to build various LLM applications. -* +**The documentation of this section is coming soon.** -.. toctree:: - :maxdepth: 2 +.. :maxdepth: 2 - eval_a_rag - introduction_to_basedataclass \ No newline at end of file +.. eval_a_rag +.. introduction_to_basedataclass From af9bbfe6600dbcd41c9c5d897e141ceff75db080 Mon Sep 17 00:00:00 2001 From: Li Yin Date: Tue, 2 Jul 2024 19:46:27 -0700 Subject: [PATCH 14/20] simplify api reference --- .../apis/components/components.agent.rst | 2 +- ...components.data_process.data_components.rst | 9 --------- ...mponents.data_process.document_splitter.rst | 2 +- .../components/components.data_process.rst | 2 +- .../components.data_process.text_splitter.rst | 9 --------- .../components/components.memory.memory.rst | 9 --------- .../apis/components/components.memory.rst | 2 +- .../components/components.model_client.rst | 2 +- .../components.model_client.utils.rst | 9 --------- .../components/components.output_parsers.rst | 2 +- .../apis/components/components.reasoning.rst | 2 +- ...components.retriever.postgres_retriever.rst | 9 --------- .../apis/components/components.retriever.rst | 2 +- docs/source/apis/core/core.base_data_class.rst | 2 +- docs/source/apis/core/core.component.rst | 2 +- docs/source/apis/core/core.data_components.rst | 2 +- docs/source/apis/core/core.db.rst | 2 +- .../apis/core/core.default_prompt_template.rst | 2 +- .../apis/core/core.document_splitter.rst | 2 +- docs/source/apis/core/core.embedder.rst | 2 +- docs/source/apis/core/core.func_tool.rst | 2 +- 
docs/source/apis/core/core.functional.rst | 2 +- docs/source/apis/core/core.generator.rst | 2 +- docs/source/apis/core/core.memory.rst | 2 +- docs/source/apis/core/core.model_client.rst | 2 +- docs/source/apis/core/core.parameter.rst | 2 +- docs/source/apis/core/core.prompt_builder.rst | 2 +- docs/source/apis/core/core.retriever.rst | 2 +- docs/source/apis/core/core.string_parser.rst | 2 +- docs/source/apis/core/core.tokenizer.rst | 2 +- docs/source/apis/core/core.tool_manager.rst | 2 +- docs/source/apis/core/core.types.rst | 2 +- .../source/apis/eval/eval.answer_match_acc.rst | 2 +- docs/source/apis/eval/eval.evaluators.rst | 2 +- docs/source/apis/eval/eval.llm_as_judge.rst | 2 +- .../source/apis/eval/eval.retriever_recall.rst | 2 +- .../apis/eval/eval.retriever_relevance.rst | 2 +- docs/source/apis/index.rst | 18 +++++++++++++----- .../apis/optim/optim.few_shot_optimizer.rst | 2 +- docs/source/apis/optim/optim.llm_augment.rst | 2 +- docs/source/apis/optim/optim.llm_optimizer.rst | 2 +- docs/source/apis/optim/optim.optimizer.rst | 2 +- docs/source/apis/optim/optim.sampler.rst | 2 +- .../source/apis/tracing/tracing.decorators.rst | 2 +- .../tracing/tracing.generator_call_logger.rst | 2 +- .../tracing/tracing.generator_state_logger.rst | 2 +- docs/source/apis/utils/utils.config.rst | 2 +- docs/source/apis/utils/utils.file_io.rst | 2 +- docs/source/apis/utils/utils.lazy_import.rst | 2 +- docs/source/apis/utils/utils.logger.rst | 2 +- docs/source/apis/utils/utils.registry.rst | 2 +- docs/source/apis/utils/utils.serialization.rst | 2 +- docs/source/apis/utils/utils.setup_env.rst | 2 +- 53 files changed, 60 insertions(+), 97 deletions(-) delete mode 100644 docs/source/apis/components/components.data_process.data_components.rst delete mode 100644 docs/source/apis/components/components.data_process.text_splitter.rst delete mode 100644 docs/source/apis/components/components.memory.memory.rst delete mode 100644 docs/source/apis/components/components.model_client.utils.rst 
delete mode 100644 docs/source/apis/components/components.retriever.postgres_retriever.rst diff --git a/docs/source/apis/components/components.agent.rst b/docs/source/apis/components/components.agent.rst index 3bea7354..fde2b8a2 100644 --- a/docs/source/apis/components/components.agent.rst +++ b/docs/source/apis/components/components.agent.rst @@ -1,6 +1,6 @@ .. _components-agent: -components.agent +agent ======================== Submodules diff --git a/docs/source/apis/components/components.data_process.data_components.rst b/docs/source/apis/components/components.data_process.data_components.rst deleted file mode 100644 index c567436c..00000000 --- a/docs/source/apis/components/components.data_process.data_components.rst +++ /dev/null @@ -1,9 +0,0 @@ -.. _components-data_process-data_components: - -components.data\_process.data\_components -================================================ - -.. automodule:: components.data_process.data_components - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/apis/components/components.data_process.document_splitter.rst b/docs/source/apis/components/components.data_process.document_splitter.rst index 8d9c0f3a..06832c3a 100644 --- a/docs/source/apis/components/components.data_process.document_splitter.rst +++ b/docs/source/apis/components/components.data_process.document_splitter.rst @@ -1,6 +1,6 @@ .. _components-data_process-document_splitter: -components.data\_process.document\_splitter +document\_splitter ================================================== .. automodule:: components.data_process.document_splitter diff --git a/docs/source/apis/components/components.data_process.rst b/docs/source/apis/components/components.data_process.rst index 1b444e9f..72ff7ae3 100644 --- a/docs/source/apis/components/components.data_process.rst +++ b/docs/source/apis/components/components.data_process.rst @@ -1,6 +1,6 @@ .. 
_components-data_process: -components.data\_process +data\_process ================================ Submodules diff --git a/docs/source/apis/components/components.data_process.text_splitter.rst b/docs/source/apis/components/components.data_process.text_splitter.rst deleted file mode 100644 index a5ceea86..00000000 --- a/docs/source/apis/components/components.data_process.text_splitter.rst +++ /dev/null @@ -1,9 +0,0 @@ -.. _components-data_process-text_splitter: - -components.data\_process.text\_splitter -============================================== - -.. automodule:: components.data_process.text_splitter - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/apis/components/components.memory.memory.rst b/docs/source/apis/components/components.memory.memory.rst deleted file mode 100644 index d6ca5051..00000000 --- a/docs/source/apis/components/components.memory.memory.rst +++ /dev/null @@ -1,9 +0,0 @@ -.. _components-memory-memory: - -components.memory.memory -=============================== - -.. automodule:: components.memory.memory - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/apis/components/components.memory.rst b/docs/source/apis/components/components.memory.rst index 4505a185..68fef512 100644 --- a/docs/source/apis/components/components.memory.rst +++ b/docs/source/apis/components/components.memory.rst @@ -1,6 +1,6 @@ .. _components-memory: -components.memory +memory ========================= Submodules diff --git a/docs/source/apis/components/components.model_client.rst b/docs/source/apis/components/components.model_client.rst index a8fb35d8..57c78622 100644 --- a/docs/source/apis/components/components.model_client.rst +++ b/docs/source/apis/components/components.model_client.rst @@ -1,6 +1,6 @@ .. 
_components-model_client: -components.model\_client +model\_client ================================ Submodules diff --git a/docs/source/apis/components/components.model_client.utils.rst b/docs/source/apis/components/components.model_client.utils.rst deleted file mode 100644 index 53ae77d2..00000000 --- a/docs/source/apis/components/components.model_client.utils.rst +++ /dev/null @@ -1,9 +0,0 @@ -.. _components-model_client-utils: - -components.model\_client.utils -===================================== - -.. automodule:: components.model_client.utils - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/apis/components/components.output_parsers.rst b/docs/source/apis/components/components.output_parsers.rst index 729afabd..2810e80c 100644 --- a/docs/source/apis/components/components.output_parsers.rst +++ b/docs/source/apis/components/components.output_parsers.rst @@ -1,6 +1,6 @@ .. _components-output_parsers: -components.output\_parsers +output\_parsers ================================== Submodules diff --git a/docs/source/apis/components/components.reasoning.rst b/docs/source/apis/components/components.reasoning.rst index 6c79f3cd..b17e5112 100644 --- a/docs/source/apis/components/components.reasoning.rst +++ b/docs/source/apis/components/components.reasoning.rst @@ -1,6 +1,6 @@ .. _components-reasoning: -components.reasoning +reasoning ============================ Submodules diff --git a/docs/source/apis/components/components.retriever.postgres_retriever.rst b/docs/source/apis/components/components.retriever.postgres_retriever.rst deleted file mode 100644 index 099bd303..00000000 --- a/docs/source/apis/components/components.retriever.postgres_retriever.rst +++ /dev/null @@ -1,9 +0,0 @@ -.. _components-retriever-postgres_retriever: - -components.retriever.postgres\_retriever -=============================================== - -.. 
automodule:: components.retriever.postgres_retriever - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/apis/components/components.retriever.rst b/docs/source/apis/components/components.retriever.rst index fdc4e1bd..27c507de 100644 --- a/docs/source/apis/components/components.retriever.rst +++ b/docs/source/apis/components/components.retriever.rst @@ -1,6 +1,6 @@ .. _components-retriever: -components.retriever +retriever ============================ Submodules diff --git a/docs/source/apis/core/core.base_data_class.rst b/docs/source/apis/core/core.base_data_class.rst index 8e629861..30fbfd14 100644 --- a/docs/source/apis/core/core.base_data_class.rst +++ b/docs/source/apis/core/core.base_data_class.rst @@ -1,6 +1,6 @@ .. _core-base_data_class: -core.base\_data\_class +base\_data\_class ============================= .. automodule:: core.base_data_class diff --git a/docs/source/apis/core/core.component.rst b/docs/source/apis/core/core.component.rst index aab446e2..dad827c7 100644 --- a/docs/source/apis/core/core.component.rst +++ b/docs/source/apis/core/core.component.rst @@ -1,6 +1,6 @@ .. _core-component: -core.component +component ===================== .. automodule:: core.component diff --git a/docs/source/apis/core/core.data_components.rst b/docs/source/apis/core/core.data_components.rst index f75d1991..f3deeb8a 100644 --- a/docs/source/apis/core/core.data_components.rst +++ b/docs/source/apis/core/core.data_components.rst @@ -1,6 +1,6 @@ .. _core-data_components: -core.data\_components +data\_components ============================ .. automodule:: core.data_components diff --git a/docs/source/apis/core/core.db.rst b/docs/source/apis/core/core.db.rst index ca74e130..4c9f8c1b 100644 --- a/docs/source/apis/core/core.db.rst +++ b/docs/source/apis/core/core.db.rst @@ -1,6 +1,6 @@ .. _core-db: -core.db +db ============== .. 
automodule:: core.db diff --git a/docs/source/apis/core/core.default_prompt_template.rst b/docs/source/apis/core/core.default_prompt_template.rst index e9f538fa..522ad037 100644 --- a/docs/source/apis/core/core.default_prompt_template.rst +++ b/docs/source/apis/core/core.default_prompt_template.rst @@ -1,6 +1,6 @@ .. _core-default_prompt_template: -core.default\_prompt\_template +default\_prompt\_template ===================================== .. automodule:: core.default_prompt_template diff --git a/docs/source/apis/core/core.document_splitter.rst b/docs/source/apis/core/core.document_splitter.rst index 48c1607a..77672d83 100644 --- a/docs/source/apis/core/core.document_splitter.rst +++ b/docs/source/apis/core/core.document_splitter.rst @@ -1,6 +1,6 @@ .. _core-document_splitter: -core.document\_splitter +document\_splitter ============================== .. automodule:: core.document_splitter diff --git a/docs/source/apis/core/core.embedder.rst b/docs/source/apis/core/core.embedder.rst index 195bb6c3..c7078e0a 100644 --- a/docs/source/apis/core/core.embedder.rst +++ b/docs/source/apis/core/core.embedder.rst @@ -1,6 +1,6 @@ .. _core-embedder: -core.embedder +embedder ==================== .. automodule:: core.embedder diff --git a/docs/source/apis/core/core.func_tool.rst b/docs/source/apis/core/core.func_tool.rst index 9e3c5e3e..f8ab69fc 100644 --- a/docs/source/apis/core/core.func_tool.rst +++ b/docs/source/apis/core/core.func_tool.rst @@ -1,6 +1,6 @@ .. _core-func_tool: -core.func\_tool +func\_tool ====================== .. automodule:: core.func_tool diff --git a/docs/source/apis/core/core.functional.rst b/docs/source/apis/core/core.functional.rst index 222c411c..cb264ccf 100644 --- a/docs/source/apis/core/core.functional.rst +++ b/docs/source/apis/core/core.functional.rst @@ -1,6 +1,6 @@ .. _core-functional: -core.functional +functional ====================== .. 
automodule:: core.functional diff --git a/docs/source/apis/core/core.generator.rst b/docs/source/apis/core/core.generator.rst index df0c8e38..60d5781c 100644 --- a/docs/source/apis/core/core.generator.rst +++ b/docs/source/apis/core/core.generator.rst @@ -1,6 +1,6 @@ .. _core-generator: -core.generator +generator ===================== .. automodule:: core.generator diff --git a/docs/source/apis/core/core.memory.rst b/docs/source/apis/core/core.memory.rst index 178062ab..00e3de26 100644 --- a/docs/source/apis/core/core.memory.rst +++ b/docs/source/apis/core/core.memory.rst @@ -1,6 +1,6 @@ .. _core-memory: -core.memory +memory ================== .. automodule:: core.memory diff --git a/docs/source/apis/core/core.model_client.rst b/docs/source/apis/core/core.model_client.rst index d4fb3a6d..4a61fd5e 100644 --- a/docs/source/apis/core/core.model_client.rst +++ b/docs/source/apis/core/core.model_client.rst @@ -1,6 +1,6 @@ .. _core-model_client: -core.model\_client +model\_client ========================= .. automodule:: core.model_client diff --git a/docs/source/apis/core/core.parameter.rst b/docs/source/apis/core/core.parameter.rst index 467c4b33..13e2bd01 100644 --- a/docs/source/apis/core/core.parameter.rst +++ b/docs/source/apis/core/core.parameter.rst @@ -1,6 +1,6 @@ .. _core-parameter: -core.parameter +parameter ===================== .. automodule:: core.parameter diff --git a/docs/source/apis/core/core.prompt_builder.rst b/docs/source/apis/core/core.prompt_builder.rst index 247ae199..26c80b01 100644 --- a/docs/source/apis/core/core.prompt_builder.rst +++ b/docs/source/apis/core/core.prompt_builder.rst @@ -1,6 +1,6 @@ .. _core-prompt_builder: -core.prompt\_builder +prompt\_builder =========================== .. 
automodule:: core.prompt_builder diff --git a/docs/source/apis/core/core.retriever.rst b/docs/source/apis/core/core.retriever.rst index 2182485e..5618703c 100644 --- a/docs/source/apis/core/core.retriever.rst +++ b/docs/source/apis/core/core.retriever.rst @@ -1,6 +1,6 @@ .. _core-retriever: -core.retriever +retriever ===================== .. automodule:: core.retriever diff --git a/docs/source/apis/core/core.string_parser.rst b/docs/source/apis/core/core.string_parser.rst index e9527602..14d2a06e 100644 --- a/docs/source/apis/core/core.string_parser.rst +++ b/docs/source/apis/core/core.string_parser.rst @@ -1,6 +1,6 @@ .. _core-string_parser: -core.string\_parser +string\_parser ========================== .. automodule:: core.string_parser diff --git a/docs/source/apis/core/core.tokenizer.rst b/docs/source/apis/core/core.tokenizer.rst index de663456..815d65c4 100644 --- a/docs/source/apis/core/core.tokenizer.rst +++ b/docs/source/apis/core/core.tokenizer.rst @@ -1,6 +1,6 @@ .. _core-tokenizer: -core.tokenizer +tokenizer ===================== .. automodule:: core.tokenizer diff --git a/docs/source/apis/core/core.tool_manager.rst b/docs/source/apis/core/core.tool_manager.rst index 852c3265..997401a8 100644 --- a/docs/source/apis/core/core.tool_manager.rst +++ b/docs/source/apis/core/core.tool_manager.rst @@ -1,6 +1,6 @@ .. _core-tool_manager: -core.tool\_manager +tool\_manager ========================= .. automodule:: core.tool_manager diff --git a/docs/source/apis/core/core.types.rst b/docs/source/apis/core/core.types.rst index 328324cb..b999b5bf 100644 --- a/docs/source/apis/core/core.types.rst +++ b/docs/source/apis/core/core.types.rst @@ -1,6 +1,6 @@ .. _core-types: -core.types +types ================= .. 
automodule:: core.types diff --git a/docs/source/apis/eval/eval.answer_match_acc.rst b/docs/source/apis/eval/eval.answer_match_acc.rst index ec068ca7..b6a0e05b 100644 --- a/docs/source/apis/eval/eval.answer_match_acc.rst +++ b/docs/source/apis/eval/eval.answer_match_acc.rst @@ -1,6 +1,6 @@ .. _eval-answer_match_acc: -eval.answer\_match\_acc +answer\_match\_acc ============================== .. automodule:: eval.answer_match_acc diff --git a/docs/source/apis/eval/eval.evaluators.rst b/docs/source/apis/eval/eval.evaluators.rst index d6ccf71b..ac0de161 100644 --- a/docs/source/apis/eval/eval.evaluators.rst +++ b/docs/source/apis/eval/eval.evaluators.rst @@ -1,6 +1,6 @@ .. _eval-evaluators: -eval.evaluators +evaluators ====================== .. automodule:: eval.evaluators diff --git a/docs/source/apis/eval/eval.llm_as_judge.rst b/docs/source/apis/eval/eval.llm_as_judge.rst index e9da7032..77774ed3 100644 --- a/docs/source/apis/eval/eval.llm_as_judge.rst +++ b/docs/source/apis/eval/eval.llm_as_judge.rst @@ -1,6 +1,6 @@ .. _eval-llm_as_judge: -eval.llm\_as\_judge +llm\_as\_judge ========================== .. automodule:: eval.llm_as_judge diff --git a/docs/source/apis/eval/eval.retriever_recall.rst b/docs/source/apis/eval/eval.retriever_recall.rst index 1e1b7f4d..1eade14a 100644 --- a/docs/source/apis/eval/eval.retriever_recall.rst +++ b/docs/source/apis/eval/eval.retriever_recall.rst @@ -1,6 +1,6 @@ .. _eval-retriever_recall: -eval.retriever\_recall +retriever\_recall ============================= .. automodule:: eval.retriever_recall diff --git a/docs/source/apis/eval/eval.retriever_relevance.rst b/docs/source/apis/eval/eval.retriever_relevance.rst index 737de569..bd2c721b 100644 --- a/docs/source/apis/eval/eval.retriever_relevance.rst +++ b/docs/source/apis/eval/eval.retriever_relevance.rst @@ -1,6 +1,6 @@ .. _eval-retriever_relevance: -eval.retriever\_relevance +retriever\_relevance ================================ .. 
automodule:: eval.retriever_relevance diff --git a/docs/source/apis/index.rst b/docs/source/apis/index.rst index 5f5f7485..12f34756 100644 --- a/docs/source/apis/index.rst +++ b/docs/source/apis/index.rst @@ -1,25 +1,31 @@ API Reference ============= -Welcome to the LightRAG API reference! This section provides detailed documentation of the internal APIs that make up the LightRAG framework. Explore the APIs to understand how to effectively utilize and integrate LightRAG components into your projects. +Welcome to the `LightRAG`. +The API reference is organized by subdirectories. + +.. This section provides detailed documentation of the internal APIs that make up the LightRAG framework. Explore the APIs to understand how to effectively utilize and integrate LightRAG components into your projects. Core ---------- +All base/abstract classes, core components like generator, embedder, and basic functions are here. + -The core section of the LightRAG API documentation provides detailed information about the foundational components of the LightRAG system. These components are essential for the basic operations and serve as the building blocks for higher-level functionalities. +.. The core section of the LightRAG API documentation provides detailed information about the foundational components of the LightRAG system. +.. These components are essential for the basic operations and serve as the building blocks for higher-level functionalities. .. autosummary:: core.component core.base_data_class core.default_prompt_template core.model_client - + .. core.data_components core.db core.functional - + core.generator core.string_parser core.embedder @@ -35,8 +41,10 @@ The core section of the LightRAG API documentation provides detailed information Components ----------- +Functional components like model client, retriever, agent, local data processing, and output parsers are here. 
-The components section of the LightRAG API documentation outlines the detailed specifications and functionalities of various API components. Each component plays a crucial role in the LightRAG framework, providing specialized capabilities and interactions. +.. The components section of the LightRAG API documentation outlines the detailed specifications and functionalities of various API components. +.. Each component plays a crucial role in the LightRAG framework, providing specialized capabilities and interactions. .. autosummary:: diff --git a/docs/source/apis/optim/optim.few_shot_optimizer.rst b/docs/source/apis/optim/optim.few_shot_optimizer.rst index 352302ea..d411e628 100644 --- a/docs/source/apis/optim/optim.few_shot_optimizer.rst +++ b/docs/source/apis/optim/optim.few_shot_optimizer.rst @@ -1,6 +1,6 @@ .. _optim-few_shot_optimizer: -optim.few\_shot\_optimizer +few\_shot\_optimizer ================================= .. automodule:: optim.few_shot_optimizer diff --git a/docs/source/apis/optim/optim.llm_augment.rst b/docs/source/apis/optim/optim.llm_augment.rst index ce88ae79..44ff64d7 100644 --- a/docs/source/apis/optim/optim.llm_augment.rst +++ b/docs/source/apis/optim/optim.llm_augment.rst @@ -1,6 +1,6 @@ .. _optim-llm_augment: -optim.llm\_augment +llm\_augment ========================= .. automodule:: optim.llm_augment diff --git a/docs/source/apis/optim/optim.llm_optimizer.rst b/docs/source/apis/optim/optim.llm_optimizer.rst index 981ca228..8c27727a 100644 --- a/docs/source/apis/optim/optim.llm_optimizer.rst +++ b/docs/source/apis/optim/optim.llm_optimizer.rst @@ -1,6 +1,6 @@ .. _optim-llm_optimizer: -optim.llm\_optimizer +llm\_optimizer =========================== .. automodule:: optim.llm_optimizer diff --git a/docs/source/apis/optim/optim.optimizer.rst b/docs/source/apis/optim/optim.optimizer.rst index 2a0e799a..8c4f8c0c 100644 --- a/docs/source/apis/optim/optim.optimizer.rst +++ b/docs/source/apis/optim/optim.optimizer.rst @@ -1,6 +1,6 @@ .. 
_optim-optimizer: -optim.optimizer +optimizer ====================== .. automodule:: optim.optimizer diff --git a/docs/source/apis/optim/optim.sampler.rst b/docs/source/apis/optim/optim.sampler.rst index 207a4cb6..4d48369e 100644 --- a/docs/source/apis/optim/optim.sampler.rst +++ b/docs/source/apis/optim/optim.sampler.rst @@ -1,6 +1,6 @@ .. _optim-sampler: -optim.sampler +sampler ==================== .. automodule:: optim.sampler diff --git a/docs/source/apis/tracing/tracing.decorators.rst b/docs/source/apis/tracing/tracing.decorators.rst index 688c1786..b2891b54 100644 --- a/docs/source/apis/tracing/tracing.decorators.rst +++ b/docs/source/apis/tracing/tracing.decorators.rst @@ -1,6 +1,6 @@ .. _tracing-decorators: -tracing.decorators +decorators ========================= .. automodule:: tracing.decorators diff --git a/docs/source/apis/tracing/tracing.generator_call_logger.rst b/docs/source/apis/tracing/tracing.generator_call_logger.rst index db70d6b6..80578c48 100644 --- a/docs/source/apis/tracing/tracing.generator_call_logger.rst +++ b/docs/source/apis/tracing/tracing.generator_call_logger.rst @@ -1,6 +1,6 @@ .. _tracing-generator_call_logger: -tracing.generator\_call\_logger +generator\_call\_logger ====================================== .. automodule:: tracing.generator_call_logger diff --git a/docs/source/apis/tracing/tracing.generator_state_logger.rst b/docs/source/apis/tracing/tracing.generator_state_logger.rst index 1b562a96..4a2bfa34 100644 --- a/docs/source/apis/tracing/tracing.generator_state_logger.rst +++ b/docs/source/apis/tracing/tracing.generator_state_logger.rst @@ -1,6 +1,6 @@ .. _tracing-generator_state_logger: -tracing.generator\_state\_logger +generator\_state\_logger ======================================= .. 
automodule:: tracing.generator_state_logger diff --git a/docs/source/apis/utils/utils.config.rst b/docs/source/apis/utils/utils.config.rst index a786dc7e..c4350e96 100644 --- a/docs/source/apis/utils/utils.config.rst +++ b/docs/source/apis/utils/utils.config.rst @@ -1,6 +1,6 @@ .. _utils-config: -utils.config +config =================== .. automodule:: utils.config diff --git a/docs/source/apis/utils/utils.file_io.rst b/docs/source/apis/utils/utils.file_io.rst index 0b3ffb8b..a6bd2610 100644 --- a/docs/source/apis/utils/utils.file_io.rst +++ b/docs/source/apis/utils/utils.file_io.rst @@ -1,6 +1,6 @@ .. _utils-file_io: -utils.file\_io +file\_io ===================== .. automodule:: utils.file_io diff --git a/docs/source/apis/utils/utils.lazy_import.rst b/docs/source/apis/utils/utils.lazy_import.rst index b76130ea..309b4bc4 100644 --- a/docs/source/apis/utils/utils.lazy_import.rst +++ b/docs/source/apis/utils/utils.lazy_import.rst @@ -1,6 +1,6 @@ .. _utils-lazy_import: -utils.lazy\_import +lazy\_import ========================= .. automodule:: utils.lazy_import diff --git a/docs/source/apis/utils/utils.logger.rst b/docs/source/apis/utils/utils.logger.rst index 4d9d9d42..5ad622fc 100644 --- a/docs/source/apis/utils/utils.logger.rst +++ b/docs/source/apis/utils/utils.logger.rst @@ -1,6 +1,6 @@ .. _utils-logger: -utils.logger +logger =================== .. automodule:: utils.logger diff --git a/docs/source/apis/utils/utils.registry.rst b/docs/source/apis/utils/utils.registry.rst index 77a8095e..c8426005 100644 --- a/docs/source/apis/utils/utils.registry.rst +++ b/docs/source/apis/utils/utils.registry.rst @@ -1,6 +1,6 @@ .. _utils-registry: -utils.registry +registry ===================== .. 
automodule:: utils.registry diff --git a/docs/source/apis/utils/utils.serialization.rst b/docs/source/apis/utils/utils.serialization.rst index 61980a68..38076a9a 100644 --- a/docs/source/apis/utils/utils.serialization.rst +++ b/docs/source/apis/utils/utils.serialization.rst @@ -1,6 +1,6 @@ .. _utils-serialization: -utils.serialization +serialization ========================== .. automodule:: utils.serialization diff --git a/docs/source/apis/utils/utils.setup_env.rst b/docs/source/apis/utils/utils.setup_env.rst index dc669ad5..c4a7241a 100644 --- a/docs/source/apis/utils/utils.setup_env.rst +++ b/docs/source/apis/utils/utils.setup_env.rst @@ -1,6 +1,6 @@ .. _utils-setup_env: -utils.setup\_env +setup\_env ======================= .. automodule:: utils.setup_env From f9499892181d068bee6700583920278f8f66187a Mon Sep 17 00:00:00 2001 From: Li Yin Date: Tue, 2 Jul 2024 20:47:16 -0700 Subject: [PATCH 15/20] complete first version parser --- developer_notes/parser_note.py | 46 +++++ .../source/developer_notes/output_parsers.rst | 160 +++++++++++++++++- .../components/output_parsers/outputs.py | 28 +-- 3 files changed, 223 insertions(+), 11 deletions(-) diff --git a/developer_notes/parser_note.py b/developer_notes/parser_note.py index 56b8d3bb..6e3a2a7f 100644 --- a/developer_notes/parser_note.py +++ b/developer_notes/parser_note.py @@ -227,6 +227,50 @@ def yaml_parser(): print(parser(yaml_list_str)) +def json_output_parser(): + from dataclasses import dataclass, field + from lightrag.components.output_parsers import JsonOutputParser + from lightrag.core import DataClass + + @dataclass + class User(DataClass): + id: int = field(default=1, metadata={"description": "User ID"}) + name: str = field(default="John", metadata={"description": "User name"}) + + user_example = User(id=1, name="John") + + user_to_parse = '{"id": 2, "name": "Jane"}' + + parser = JsonOutputParser(data_class=User, examples=[user_example]) + print(parser) + output_format_str = parser.format_instructions() 
+ print(output_format_str) + parsed_user = parser(user_to_parse) + print(parsed_user) + + +def yaml_output_parser(): + from dataclasses import dataclass, field + from lightrag.components.output_parsers import YamlOutputParser + from lightrag.core import DataClass + + @dataclass + class User(DataClass): + id: int = field(default=1, metadata={"description": "User ID"}) + name: str = field(default="John", metadata={"description": "User name"}) + + user_example = User(id=1, name="John") + + user_to_parse = "id: 2\nname: Jane" + + parser = YamlOutputParser(data_class=User, examples=[user_example]) + print(parser) + output_format_str = parser.format_instructions() + print(output_format_str) + parsed_user = parser(user_to_parse) + print(parsed_user) + + if __name__ == "__main__": examples_of_different_ways_to_parse_string() int_parser() @@ -235,3 +279,5 @@ def yaml_parser(): list_parser() json_parser() yaml_parser() + json_output_parser() + yaml_output_parser() diff --git a/docs/source/developer_notes/output_parsers.rst b/docs/source/developer_notes/output_parsers.rst index 72afcc1f..25ddc738 100644 --- a/docs/source/developer_notes/output_parsers.rst +++ b/docs/source/developer_notes/output_parsers.rst @@ -141,7 +141,7 @@ Thus, ``JsonOutputParser`` and ``YamlOutputParser`` both takes the following arg - ``examples``: the examples of the data class instance if you want to show the examples in the prompt. - ``exclude``: the fields to exclude from both the data format and the examples. -.. TODO: a summary table +.. TODO: a summary table and a diagram Parser in Action ------------------ @@ -327,6 +327,164 @@ The output will be: Output Parsers in Action -------------------------- + +We will create the following simple ``DataClass`` with one example. +And we will demonstrate how to use ``JsonOutputParser`` and ``YamlOutputParser`` to parse another example to dict object. + +.. 
code-block:: python + + from dataclasses import dataclass, field + from lightrag.core import DataClass + + @dataclass + class User(DataClass): + id: int = field(default=1, metadata={"description": "User ID"}) + name: str = field(default="John", metadata={"description": "User name"}) + + user_example = User(id=1, name="John") + +**JsonOutputParser** + +Here is how to use ``JsonOutputParser``: + +.. code-block:: python + + from lightrag.components.output_parsers import JsonOutputParser + + parser = JsonOutputParser(data_class=User, examples=[user_example]) + print(parser) + +The structure of it: + +.. code-block:: + + JsonOutputParser( + data_class=User, examples=[json_output_parser..User(id=1, name='John')], exclude_fields=None + (json_output_format_prompt): Prompt( + template: Your output should be formatted as a standard JSON instance with the following schema: + ``` + {{schema}} + ``` + {% if example %} + Examples: + ``` + {{example}} + ``` + {% endif %} + -Make sure to always enclose the JSON output in triple backticks (```). Please do not add anything other than valid JSON output! + -Use double quotes for the keys and string values. + -Follow the JSON formatting conventions., prompt_variables: ['example', 'schema'] + ) + (output_processors): JsonParser() + ) + +The output format string will be: + +.. code-block:: + + Your output should be formatted as a standard JSON instance with the following schema: + ``` + { + "id": " (int) (optional)", + "name": " (str) (optional)" + } + ``` + Examples: + ``` + { + "id": 1, + "name": "John" + } + ________ + ``` + -Make sure to always enclose the JSON output in triple backticks (```). Please do not add anything other than valid JSON output! + -Use double quotes for the keys and string values. + -Follow the JSON formatting conventions. + +Call the parser with the following string: + +.. 
code-block:: python + + user_to_parse = '{"id": 2, "name": "Jane"}' + parsed_user = parser(user_to_parse) + print(parsed_user) + +The output will be: + +.. code-block:: python + + {'id': 2, 'name': 'Jane'} + +**YamlOutputParser** + +The steps are totally the same as the ``JsonOutputParser``. + +.. code-block:: python + + from lightrag.components.output_parsers import YamlOutputParser + + parser = YamlOutputParser(data_class=User, examples=[user_example]) + print(parser) + +The structure of it: + +.. code-block:: + + YamlOutputParser( + data_class=.User'>, examples=[yaml_output_parser..User(id=1, name='John')] + (yaml_output_format_prompt): Prompt( + template: Your output should be formatted as a standard YAML instance with the following schema: + ``` + {{schema}} + ``` + {% if example %} + Examples: + ``` + {{example}} + ``` + {% endif %} + + -Make sure to always enclose the YAML output in triple backticks (```). Please do not add anything other than valid YAML output! + -Follow the YAML formatting conventions with an indent of 2 spaces. + -Quote the string values properly., prompt_variables: ['schema', 'example'] + ) + (output_processors): YamlParser() + ) + +The output format string will be: + +.. code-block:: + + Your output should be formatted as a standard YAML instance with the following schema: + ``` + id: (int) (optional) + name: (str) (optional) + ``` + Examples: + ``` + id: 1 + name: John + + ________ + ``` + + -Make sure to always enclose the YAML output in triple backticks (```). Please do not add anything other than valid YAML output! + -Follow the YAML formatting conventions with an indent of 2 spaces. + -Quote the string values properly. + +Now, let us parse the following string: + +.. code-block:: python + + user_to_parse = "id: 2\nname: Jane" + parsed_user = parser(user_to_parse) + print(parsed_user) + +The output will be: + +.. code-block:: python + + {'id': 2, 'name': 'Jane'} .. # todo .. Evaluate Format following .. 
-------------------------- diff --git a/lightrag/lightrag/components/output_parsers/outputs.py b/lightrag/lightrag/components/output_parsers/outputs.py index 337c8570..211835e9 100644 --- a/lightrag/lightrag/components/output_parsers/outputs.py +++ b/lightrag/lightrag/components/output_parsers/outputs.py @@ -51,10 +51,7 @@ -Quote the string values properly.""" LIST_OUTPUT_FORMAT = r"""Your output should be formatted as a standard Python list. --Each element can be of any Python data type such as string, integer, float, list, dictionary, etc. --You can also have nested lists and dictionaries. --Please do not add anything other than valid Python list output! -""" +- Start the list with '[' and end with ']'""" YAML_OUTPUT_PARSER_OUTPUT_TYPE = Dict[str, Any] @@ -139,13 +136,18 @@ def __init__( if not is_dataclass(data_class): raise ValueError(f"Provided class is not a dataclass: {data_class}") + if not issubclass(data_class, DataClass): + raise ValueError( + f"Provided class is not a subclass of DataClass: {data_class}" + ) + # ensure example is instance of data class and initiated if examples is not None and not isinstance(examples[0], data_class): raise ValueError( f"Provided example is not an instance of the data class: {data_class}" ) self._exclude_fields = exclude_fields - self.data_class_for_yaml: DataClass = data_class + self.data_class: DataClass = data_class self.yaml_output_format_prompt = Prompt(template=YAML_OUTPUT_FORMAT) self.output_processors = YamlParser() self.examples = examples @@ -163,7 +165,7 @@ def format_instructions( exclude (List[str], optional): The fields to exclude from the schema of the data class. 
""" format_type = format_type or DataClassFormatType.SIGNATURE_YAML - schema = self.data_class_for_yaml.format_class_str( + schema = self.data_class.format_class_str( format_type=format_type, exclude=self._exclude_fields ) # convert example to string, convert data class to yaml string @@ -189,7 +191,7 @@ def call(self, input: str) -> YAML_OUTPUT_PARSER_OUTPUT_TYPE: return self.output_processors(input) def _extra_repr(self) -> str: - s = f"data_class_for_yaml={self.data_class_for_yaml}, examples={self.examples}" + s = f"data_class={self.data_class}, examples={self.examples}" return s @@ -204,13 +206,18 @@ def __init__( if not is_dataclass(data_class): raise ValueError(f"Provided class is not a dataclass: {data_class}") + if not issubclass(data_class, DataClass): + raise ValueError( + f"Provided class is not a subclass of DataClass: {data_class}" + ) + if examples is not None and not isinstance(examples[0], data_class): raise ValueError( f"Provided example is not an instance of the data class: {data_class}" ) self._exclude_fields = exclude_fields template = JSON_OUTPUT_FORMAT - self.data_class_for_json: DataClass = data_class + self.data_class: DataClass = data_class self.json_output_format_prompt = Prompt(template=template) self.output_processors = JsonParser() self.examples = examples @@ -228,7 +235,7 @@ def format_instructions( Options: DataClassFormatType.SIGNATURE_YAML, DataClassFormatType.SIGNATURE_JSON, DataClassFormatType.SCHEMA. 
""" format_type = format_type or DataClassFormatType.SIGNATURE_JSON - schema = self.data_class_for_json.format_class_str( + schema = self.data_class.format_class_str( format_type=format_type, exclude=self._exclude_fields ) example_str = "" @@ -244,6 +251,7 @@ def format_instructions( log.debug(f"{__class__.__name__} example_str: {example_str}") except Exception: + log.error(f"Error in formatting example for {__class__.__name__}") example_str = None return self.json_output_format_prompt(schema=schema, example=example_str) @@ -251,7 +259,7 @@ def call(self, input: str) -> Any: return self.output_processors(input) def _extra_repr(self) -> str: - s = f"data_class_for_json={self.data_class_for_json}, examples={self.examples}, exclude_fields={self._exclude_fields}" + s = f"""data_class={self.data_class.__name__}, examples={self.examples}, exclude_fields={self._exclude_fields}""" return s From 829a1dcb982b28b3776013a0fa4f86857986ce13 Mon Sep 17 00:00:00 2001 From: Li Yin Date: Tue, 2 Jul 2024 21:02:35 -0700 Subject: [PATCH 16/20] add github link in documents --- docs/source/conf.py | 7 ++++++- docs/source/developer_notes/output_parsers.rst | 3 +++ lightrag/lightrag/components/output_parsers/outputs.py | 2 +- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 77ae00ff..d9d059b1 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -85,9 +85,14 @@ # "includehidden": True, # "titles_only": False, "icon_links": [ + { + "name": "GitHub", + "url": "https://github.com/SylphAI-Inc/LightRAG", # Replace with your GitHub URL + "icon": "fa-brands fa-github", + }, { "name": "Discord", - "url": "https://discord.gg/hmZWFEUd", + "url": "https://discord.gg/ezzszrRZvT", "icon": "fa-brands fa-discord", }, ], diff --git a/docs/source/developer_notes/output_parsers.rst b/docs/source/developer_notes/output_parsers.rst index 25ddc738..314766a6 100644 --- a/docs/source/developer_notes/output_parsers.rst +++ 
b/docs/source/developer_notes/output_parsers.rst @@ -503,3 +503,6 @@ The output will be: - :ref:`OutputParser` - :class:`components.output_parsers.outputs.JsonOutputParser` - :class:`components.output_parsers.outputs.YamlOutputParser` + - :class:`components.output_parsers.outputs.OutputParser` + - :class:`components.output_parsers.outputs.BooleanOutputParser` + - :class:`components.output_parsers.outputs.ListOutputParser` diff --git a/lightrag/lightrag/components/output_parsers/outputs.py b/lightrag/lightrag/components/output_parsers/outputs.py index 211835e9..af5b4f51 100644 --- a/lightrag/lightrag/components/output_parsers/outputs.py +++ b/lightrag/lightrag/components/output_parsers/outputs.py @@ -191,7 +191,7 @@ def call(self, input: str) -> YAML_OUTPUT_PARSER_OUTPUT_TYPE: return self.output_processors(input) def _extra_repr(self) -> str: - s = f"data_class={self.data_class}, examples={self.examples}" + s = f"data_class={self.data_class.__name__}, examples={self.examples}, exclude_fields={self._exclude_fields}" return s From 45e61ad98645e3269f95f36192ee5d367fa97d3b Mon Sep 17 00:00:00 2001 From: Li Yin Date: Wed, 3 Jul 2024 10:19:26 -0700 Subject: [PATCH 17/20] add readme v1 --- README.md | 78 +++++++------ docs/source/get_started/installation.rst | 2 +- lightrag/README.md | 138 +++++++++++++++-------- 3 files changed, 132 insertions(+), 86 deletions(-) diff --git a/README.md b/README.md index 8c93d482..7a7b5d3f 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,11 @@ -# Introduction +![LightRAG Logo](docs/source/_static/images/LightRAG-logo-doc.jpeg) + +⚡ The PyTorch Library for Large language Model (LLM) Applications ⚡ + +We help developers with both building and optimizing `Retriever`-`Agent`-`Generator` (RAG) pipelines. +It is *light*, *modular*, and *robust*. + -LightRAG is the `PyTorch` library for building large language model (LLM) applications. We help developers with both building and optimizing `Retriever`-`Agent`-`Generator` (RAG) pipelines. 
-It is light, modular, and robust. **PyTorch** @@ -58,46 +62,46 @@ class SimpleQA(Component): return await self.generator.acall({"input_str": query}) ``` -## Simplicity +## Quick Install -Developers who are building real-world Large Language Model (LLM) applications are the real heroes. -As a library, we provide them with the fundamental building blocks with 100% clarity and simplicity. +Install LightRAG with pip: -* Two fundamental and powerful base classes: Component for the pipeline and DataClass for data interaction with LLMs. -* We end up with less than two levels of subclasses. Class Hierarchy Visualization. -* The result is a library with bare minimum abstraction, providing developers with maximum customizability. +```bash +pip install lightrag +``` -Similar to the PyTorch module, our Component provides excellent visualization of the pipeline structure. +Please refer to the [full installation guide](https://lightrag.sylph.ai/get_started/installation.html) for more details. -``` -SimpleQA( - (generator): Generator( - model_kwargs={'model': 'llama3-8b-8192'}, - (prompt): Prompt( - template: - You are a helpful assistant. - - User: {{input_str}} - You: - , prompt_variables: ['input_str'] - ) - (model_client): GroqAPIClient() - ) -) -``` -## Controllability -Our simplicity did not come from doing 'less'. -On the contrary, we have to do 'more' and go 'deeper' and 'wider' on any topic to offer developers maximum control and robustness. +You can place the above code in your project's root ``__init__.py`` file. +This setup ensures that LightRAG can access all necessary configurations during runtime. -* LLMs are sensitive to the prompt. We allow developers full control over their prompts without relying on API features such as tools and JSON format with components like Prompt, OutputParser, FunctionTool, and ToolManager. -* Our goal is not to optimize for integration, but to provide a robust abstraction with representative examples. 
See this in ModelClient and Retriever. -* All integrations, such as different API SDKs, are formed as optional packages but all within the same library. You can easily switch to any models from different providers that we officially support. +# Documentation -## Future of LLM Applications +LightRAG full documentation available at [lightrag.sylph.ai](https://lightrag.sylph.ai/): -On top of the easiness to use, we in particular optimize the configurability of components for researchers to build their solutions and to benchmark existing solutions. -Like how PyTorch has united both researchers and production teams, it enables smooth transition from research to production. -With researchers building on LightRAG, production engineers can easily take over the method and test and iterate on their production data. -Researchers will want their code to be adapted into more products too. +- [Introduction](https://lightrag.sylph.ai/) +- [Full installation guide](https://lightrag.sylph.ai/get_started/installation.html) +- [Design philosophy](https://lightrag.sylph.ai/developer_notes/lightrag_design_philosophy.html) +- [Class hierarchy](https://lightrag.sylph.ai/developer_notes/class_hierarchy.html) +- [Tutorials](https://lightrag.sylph.ai/developer_notes/index.html) +- [API reference](https://lightrag.sylph.ai/apis/index.html) + + + +## Contributors + +[![contributors](https://contrib.rocks/image?repo=SylphAI-Inc/LightRAG&max=2000)](https://github.com/SylphAI-Inc/LightRAG/graphs/contributors) + +# Citation + +```bibtex +@software{Yin-LightRAG-2024, + author = {Yin, Li}, + title = {{LightRAG: The PyTorch Library for Large language Model (LLM) Applications}}, + month = {7}, + year = {2024}, + url = {https://github.com/SylphAI-Inc/LightRAG} +} +``` diff --git a/docs/source/get_started/installation.rst b/docs/source/get_started/installation.rst index a0ddb9c4..ef1f3349 100644 --- a/docs/source/get_started/installation.rst +++ b/docs/source/get_started/installation.rst @@ -54,7 +54,7 
@@ Or, you can load it yourself with ``python-dotenv``: You can place the above code in your project's root ``__init__.py`` file. This setup ensures that LightRAG can access all necessary configurations during runtime. -1. Install Optional Packages +4. Install Optional Packages ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/lightrag/README.md b/lightrag/README.md index 4321213a..d2d676d0 100644 --- a/lightrag/README.md +++ b/lightrag/README.md @@ -1,66 +1,108 @@ - +```bash +pip install lightrag +``` + +Please refer to the [full installation guide](https://lightrag.sylph.ai/get_started/installation.html) for more details. + + + +You can place the above code in your project's root ``__init__.py`` file. +This setup ensures that LightRAG can access all necessary configurations during runtime. + +# Documentation + +LightRAG full documentation available at [lightrag.sylph.ai](https://lightrag.sylph.ai/): + +- [Introduction](https://lightrag.sylph.ai/) +- [Full installation guide](https://lightrag.sylph.ai/get_started/installation.html) +- [Design philosophy](https://lightrag.sylph.ai/developer_notes/lightrag_design_philosophy.html) +- [Class hierarchy](https://lightrag.sylph.ai/developer_notes/class_hierarchy.html) +- [Tutorials](https://lightrag.sylph.ai/developer_notes/index.html) +- [API reference](https://lightrag.sylph.ai/apis/index.html) + + + +## Contributors + +[![contributors](https://contrib.rocks/image?repo=SylphAI-Inc/LightRAG&max=2000)](https://github.com/SylphAI-Inc/LightRAG/graphs/contributors) + +# Citation + +```bibtex +@software{Yin-LightRAG-2024, + author = {Yin, Li}, + title = {{LightRAG: The PyTorch Library for Large language Model (LLM) Applications}}, + month = {7}, + year = {2024}, + url = {https://github.com/SylphAI-Inc/LightRAG} +} +``` From a5e0951a727bcf3f971952c87106e96f288c0c84 Mon Sep 17 00:00:00 2001 From: Li Yin Date: Wed, 3 Jul 2024 13:10:46 -0700 Subject: [PATCH 18/20] test new workflow that will try to get the domain name right --- 
.github/workflows/documentation_li.yml | 4 +++ CNAME | 1 + README.md | 14 +++++----- .../developer_notes/class_hierarchy.rst | 2 +- .../lightrag_design_philosophy.rst | 26 +++++++++++-------- lightrag/README.md | 13 ++++------ lightrag/lightrag/__init__.py | 3 --- 7 files changed, 32 insertions(+), 31 deletions(-) create mode 100644 CNAME diff --git a/.github/workflows/documentation_li.yml b/.github/workflows/documentation_li.yml index 335a063c..47b4a957 100644 --- a/.github/workflows/documentation_li.yml +++ b/.github/workflows/documentation_li.yml @@ -51,6 +51,10 @@ jobs: touch .nojekyll working-directory: ${{ github.workspace }}/docs/build + - name: Copy CNAME file + run: | + cp ${{ github.workspace }}/CNAME ${{ github.workspace }}/docs/build/CNAME + - name: Deploy to GitHub Pages uses: peaceiris/actions-gh-pages@v3 with: diff --git a/CNAME b/CNAME new file mode 100644 index 00000000..3088cf8c --- /dev/null +++ b/CNAME @@ -0,0 +1 @@ +lightrag.sylph.ai diff --git a/README.md b/README.md index 7a7b5d3f..706b3e77 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ ![LightRAG Logo](docs/source/_static/images/LightRAG-logo-doc.jpeg) -⚡ The PyTorch Library for Large language Model (LLM) Applications ⚡ +## ⚡⚡⚡ The PyTorch Library for Large language Model (LLM) Applications ⚡⚡⚡ -We help developers with both building and optimizing `Retriever`-`Agent`-`Generator` (RAG) pipelines. +*LightRAG* helps developers with both building and optimizing *Retriever-Agent-Generator (RAG)* pipelines. It is *light*, *modular*, and *robust*. @@ -38,7 +38,6 @@ class Net(nn.Module): from lightrag.core import Component, Generator from lightrag.components.model_client import GroqAPIClient -from lightrag.utils import setup_env #noqa class SimpleQA(Component): def __init__(self): @@ -74,8 +73,6 @@ Please refer to the [full installation guide](https://lightrag.sylph.ai/get_star -You can place the above code in your project's root ``__init__.py`` file. 
-This setup ensures that LightRAG can access all necessary configurations during runtime. # Documentation @@ -83,13 +80,14 @@ LightRAG full documentation available at [lightrag.sy - [Introduction](https://lightrag.sylph.ai/) - [Full installation guide](https://lightrag.sylph.ai/get_started/installation.html) -- [Design philosophy](https://lightrag.sylph.ai/developer_notes/lightrag_design_philosophy.html) -- [Class hierarchy](https://lightrag.sylph.ai/developer_notes/class_hierarchy.html) -- [Tutorials](https://lightrag.sylph.ai/developer_notes/index.html) +- [Design philosophy](https://lightrag.sylph.ai/developer_notes/lightrag_design_philosophy.html): Design based on three principles: Simplicity over complexity, Quality over quantity, and Optimizing over building. +- [Class hierarchy](https://lightrag.sylph.ai/developer_notes/class_hierarchy.html): We have no more than two levels of subclasses. The bare minimum abstraction will provide developers with maximum customizability and simplicity. +- [Tutorials](https://lightrag.sylph.ai/developer_notes/index.html): Learn the `why` and `how-to` (customize and integrate) behind each core part within the `LightRAG` library. - [API reference](https://lightrag.sylph.ai/apis/index.html) + ## Contributors [![contributors](https://contrib.rocks/image?repo=SylphAI-Inc/LightRAG&max=2000)](https://github.com/SylphAI-Inc/LightRAG/graphs/contributors) diff --git a/docs/source/developer_notes/class_hierarchy.rst b/docs/source/developer_notes/class_hierarchy.rst index 074aad79..931d6e48 100644 --- a/docs/source/developer_notes/class_hierarchy.rst +++ b/docs/source/developer_notes/class_hierarchy.rst @@ -1,6 +1,6 @@ Class Hierarchy ============================= -From the plot of the `LightRAG` library's class hierarchy, we can see the library is well-centered around two base classes: `Component` and `DataClass`, and it has no more than two levels of subclasses.
+From the plot of the `LightRAG` library's class hierarchy, we can see the library is well-centered around two base classes: `Component` and `DataClass`, and it maintains a class inheritance hierarchy with no more than two levels. This design philosophy results in a library with bare minimum abstraction, providing developers with maximum customizability. .. raw:: html diff --git a/docs/source/developer_notes/lightrag_design_philosophy.rst b/docs/source/developer_notes/lightrag_design_philosophy.rst index 462fd228..f2fb932c 100644 --- a/docs/source/developer_notes/lightrag_design_philosophy.rst +++ b/docs/source/developer_notes/lightrag_design_philosophy.rst @@ -4,7 +4,17 @@ Design Philosophy Right from the begining, `LightRAG` follows three fundamental principles. -Principle 1: Quality over Quantity +Principle 1: Simplicity over Complexity +----------------------------------------------------------------------- +We put these three hard rules while designing LightRAG: + +- Every layer of abstraction needs to be adjusted and overall we do not allow more than 3 layers of abstraction. +- We minimize the lines of code instead of maximizing the lines of code. +- Go *deep* and *wide* in order to *simplify*. The clarity we achieve is not the result of being easy, but the result of being deep. + + + +Principle 2: Quality over Quantity ----------------------------------------------------------------------- The Quality of core building blocks over the Quantity of integrations. @@ -14,7 +24,7 @@ This goes for the prompt, the model client, the retriever, the optimizer, and th -Principle 2: Optimizing over Building +Principle 3: Optimizing over Building ----------------------------------------------------------------------- We help users build the task pipeline, but we want to help with optimizing even more so. @@ -28,20 +38,14 @@ to ease the existing frustrations of optimizing the task pipeline. 
-Principle 3: Practicality over Showmanship ------------------------------------------------------------------------ -We put these three hard rules while designing LightRAG: - -- Every layer of abstraction needs to be adjusted and overall we do not allow more than 3 layers of abstraction. -- We minimize the lines of code instead of maximizing the lines of code. -- Go `deep` and `wide` in order to `simplify`. The clarity we achieve is not the result of being easy, but the result of being deep. -Our deep understanding of LLM workflow +Our understanding of LLM workflow ----------------------------------------------------------------------- -The above principles are distilled from our deep understanding of the LLM workflow. +The above principles are distilled from our experiences and continuous learning about the LLM workflow. + **Developers are the ultimate heroes** diff --git a/lightrag/README.md b/lightrag/README.md index d2d676d0..7ecd6c78 100644 --- a/lightrag/README.md +++ b/lightrag/README.md @@ -1,9 +1,9 @@ ![LightRAG Logo](../docs/source/_static/images/LightRAG-logo-doc.jpeg) -⚡ The PyTorch Library for Large language Model (LLM) Applications ⚡ +## ⚡⚡⚡ The PyTorch Library for Large language Model (LLM) Applications ⚡⚡⚡ -We help developers with both building and optimizing `Retriever`-`Agent`-`Generator` (RAG) pipelines. +*LightRAG* helps developers with both building and optimizing *Retriever-Agent-Generator (RAG)* pipelines. It is *light*, *modular*, and *robust*. @@ -75,18 +75,15 @@ Please refer to the [full installation guide](https://lightrag.sylph.ai/get_star -You can place the above code in your project's root ``__init__.py`` file. -This setup ensures that LightRAG can access all necessary configurations during runtime. 
- # Documentation LightRAG full documentation available at [lightrag.sylph.ai](https://lightrag.sylph.ai/): - [Introduction](https://lightrag.sylph.ai/) - [Full installation guide](https://lightrag.sylph.ai/get_started/installation.html) -- [Design philosophy](https://lightrag.sylph.ai/developer_notes/lightrag_design_philosophy.html) -- [Class hierarchy](https://lightrag.sylph.ai/developer_notes/class_hierarchy.html) -- [Tutorials](https://lightrag.sylph.ai/developer_notes/index.html) +- [Design philosophy](https://lightrag.sylph.ai/developer_notes/lightrag_design_philosophy.html): Design based on three principles: Simplicity over complexity, Quality over quantity, and Optimizing over building. +- [Class hierarchy](https://lightrag.sylph.ai/developer_notes/class_hierarchy.html): We have no more than two levels of subclasses. The bare minimum abstraction will provide developers with maximum customizability and simplicity. +- [Tutorials](https://lightrag.sylph.ai/developer_notes/index.html): Learn the `why` and `how-to` (customize and integrate) behind each core part within the `LightRAG` library.
- [API reference](https://lightrag.sylph.ai/apis/index.html) diff --git a/lightrag/lightrag/__init__.py b/lightrag/lightrag/__init__.py index d33bab7c..e69de29b 100644 --- a/lightrag/lightrag/__init__.py +++ b/lightrag/lightrag/__init__.py @@ -1,3 +0,0 @@ -from lightrag.utils import setup_env - -setup_env() From 3b6eec1bf1fe4d3d9592cf6fbe587cd8099522e9 Mon Sep 17 00:00:00 2001 From: Li Yin Date: Wed, 3 Jul 2024 13:16:27 -0700 Subject: [PATCH 19/20] fix the delete of domain name --- .github/workflows/documentation.yml | 20 ++++++++++++-------- lightrag/pyproject.toml | 2 +- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml index 1ea72ee8..87f8f430 100644 --- a/.github/workflows/documentation.yml +++ b/.github/workflows/documentation.yml @@ -3,7 +3,7 @@ name: Documentation on: push: branches: - - release # Trigger the workflow when changes are pushed to the release branch + - release # Trigger the workflow when changes are pushed to the release branch permissions: contents: write @@ -17,12 +17,12 @@ jobs: - name: Checkout code uses: actions/checkout@v4 with: - fetch-depth: 0 # Fetch all history for all branches and tags + fetch-depth: 0 # Fetch all history for all branches and tags - name: Set up Python uses: actions/setup-python@v5 with: - python-version: '3.11' # Ensure the Python version is correct + python-version: '3.11' # Ensure the Python version is correct - name: Install Poetry run: | @@ -51,14 +51,18 @@ jobs: touch .nojekyll # Prevent GitHub Pages from ignoring files that start with an underscore working-directory: ${{ github.workspace }}/docs/build + - name: Copy CNAME file + run: | + cp ${{ github.workspace }}/CNAME ${{ github.workspace }}/docs/build/CNAME + - name: Deploy to GitHub Pages uses: peaceiris/actions-gh-pages@v3 with: - github_token: ${{ secrets.GITHUB_TOKEN }} # GitHub token for authentication - publish_branch: gh-pages # Target branch for GitHub Pages 
deployment - publish_dir: ./docs/build/ # Directory containing the built documentation - user_name: github-actions[bot] # Username for the commit - user_email: github-actions[bot]@users.noreply.github.com # Email for the commit + github_token: ${{ secrets.GITHUB_TOKEN }} # GitHub token for authentication + publish_branch: gh-pages # Target branch for GitHub Pages deployment + publish_dir: ./docs/build/ # Directory containing the built documentation + user_name: github-actions[bot] # Username for the commit + user_email: github-actions[bot]@users.noreply.github.com # Email for the commit # Uncomment below for debugging purposes # - name: Debug Output diff --git a/lightrag/pyproject.toml b/lightrag/pyproject.toml index 4665bc46..81e530b1 100644 --- a/lightrag/pyproject.toml +++ b/lightrag/pyproject.toml @@ -1,7 +1,7 @@ [tool.poetry] name = "lightrag" -version = "0.0.0-alpha.8" +version = "0.0.0-alpha.9" description = "The 'PyTorch' library for LLM applications. RAG=Retriever-Agent-Generator." 
authors = ["Li Yin "] readme = "README.md" From e407ff283e0fea5d438914246214a506dfa64283 Mon Sep 17 00:00:00 2001 From: Li Yin Date: Wed, 3 Jul 2024 13:40:21 -0700 Subject: [PATCH 20/20] use url for logo --- README.md | 2 +- lightrag/README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 706b3e77..ff6ba48c 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -![LightRAG Logo](docs/source/_static/images/LightRAG-logo-doc.jpeg) +![LightRAG Logo](https://raw.githubusercontent.com/SylphAI-Inc/LightRAG/main/docs/source/_static/images/LightRAG-logo-doc.jpeg) ## ⚡⚡⚡ The PyTorch Library for Large language Model (LLM) Applications ⚡⚡⚡ diff --git a/lightrag/README.md b/lightrag/README.md index 7ecd6c78..67e3bc99 100644 --- a/lightrag/README.md +++ b/lightrag/README.md @@ -1,4 +1,4 @@ -![LightRAG Logo](../docs/source/_static/images/LightRAG-logo-doc.jpeg) +![LightRAG Logo](https://raw.githubusercontent.com/SylphAI-Inc/LightRAG/main/docs/source/_static/images/LightRAG-logo-doc.jpeg) ## ⚡⚡⚡ The PyTorch Library for Large language Model (LLM) Applications ⚡⚡⚡