Skip to content

Commit

Permalink
adapt the rag to the new code
Browse files Browse the repository at this point in the history
  • Loading branch information
liyin2015 committed May 21, 2024
1 parent 8c3b6ae commit 32f6b8e
Show file tree
Hide file tree
Showing 7 changed files with 37 additions and 20 deletions.
3 changes: 3 additions & 0 deletions benchmarks/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Benchmarking is an integral development part of the project.

Contributors are encouraged to write benchmarks for their code, in addition to the unit tests in the `tests/` directory.
4 changes: 2 additions & 2 deletions core/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from core.component import Component
from core.data_classes import Document

from core.functional import generate_component_key, generate_readable_key_for_function
from core.functional import generate_readable_key_for_function

"""
Why do we need a localDocumentDB as the product db is always in the cloud?
Expand Down Expand Up @@ -73,7 +73,7 @@ def transform_data(
) -> List[Document]:
"""Transform the documents using the transformer, the transformed documents will be used to build index."""
if key is None:
key = generate_component_key(transformer)
key = transformer._get_name() + "_transformed"
documents_to_use = documents.copy() if documents else self.documents.copy()
self.transformed_documents[key] = transformer(documents_to_use)
return key
Expand Down
9 changes: 7 additions & 2 deletions docs/source/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ You have a similar coding experience as PyTorch. Here is a side to side comparis
We encourage all users to at least skim through the developer documentation. Different from "PyTorch", where a typical user does not have to customize a building module for a neural network,
LLM applications have a much bigger scope and vary even more across different product environments, so it is much more common for developers to customize components on their own.

Developer documentation
Dive deep into the design of the libraries
=======================

.. toctree::
Expand Down Expand Up @@ -156,6 +156,11 @@ Developer documentation
apis/prompts/prompts
apis/eval/eval

.. toctree::
:maxdepth: 1
:caption: Benchmarks
.. Manually add documents for the code in benchmarks


.. toctree::
:glob:
Expand All @@ -166,7 +171,7 @@ Developer documentation
resources/contributing


User documentation
Use the library
=======================

.. toctree::
Expand Down
2 changes: 1 addition & 1 deletion prompts/outputs.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""The most commonly used output parsers for the Generator.
Note: Even with OutputParser for output_format_str formatting and the response parsing, it is not 100% guaranteed
as user query can impact the output.
as user query can impact the output. Test your code well!
"""

from dataclasses import is_dataclass
Expand Down
13 changes: 7 additions & 6 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,20 +1,21 @@
[tool.poetry]
name = "lightrag"

packages = [
{ include = "core", from = "." },
{ include = "components", from = "." },
{ include = "prompts", from = "." },
{ include = "eval", from = "." },
]
version = "0.1.0"
description = "1000 lines of code is all you need. Be light and be lightening fast."
description = "The 'PyTorch' library for LLM applications. RAG=Retriever-Agent-Generator."
authors = ["Li Yin <[email protected]>"]
readme = "README.md"
license = "MIT"
classifiers = [
"Topic :: Software Development :: Build Tools",
"Topic :: Software Development :: Libraries :: Python Modules",
]
packages = [
{ include = "core", from = "." },
{ include = "components", from = "." },
# { include = "use_cases", from = "." },
]

[tool.poetry.dependencies]
python = ">=3.11, <4.0"
Expand Down
12 changes: 8 additions & 4 deletions use_cases/rag.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,11 @@
from core.component import Component, Sequential
from core.db import LocalDocumentDB

from core.functional import generate_component_key
import os

os.environ["KMP_DUPLICATE_LIB_OK"] = "True"

# from core.functional import generate_component_key

from components.api_client import OpenAIClient
from components.retriever import FAISSRetriever
Expand All @@ -36,7 +40,7 @@ def __init__(self, settings: dict):
self.text_splitter_settings = settings["text_splitter"]

vectorizer = Embedder(
model_client=OpenAIClient(),
model_client=OpenAIClient,
# batch_size=self.vectorizer_settings["batch_size"],
model_kwargs=self.vectorizer_settings["model_kwargs"],
output_processors=ToEmbedderResponse(),
Expand All @@ -54,7 +58,7 @@ def __init__(self, settings: dict):
batch_size=self.vectorizer_settings["batch_size"],
),
)
self.data_transformer_key = generate_component_key(self.data_transformer)
self.data_transformer_key = self.data_transformer._get_name()
# initialize retriever, which depends on the vectorizer too
self.retriever = FAISSRetriever(
top_k=self.retriever_settings["top_k"],
Expand Down Expand Up @@ -83,7 +87,7 @@ def __init__(self, settings: dict):
"answer": "The answer to the query",
}"""
},
model_client=OpenAIClient(),
model_client=OpenAIClient,
model_kwargs=self.generator_model_kwargs,
output_processors=JsonParser(),
)
Expand Down
14 changes: 9 additions & 5 deletions use_cases/rag_hotpotqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,14 +42,18 @@ def get_supporting_sentences(

if __name__ == "__main__":
    # NOTE: for the output of the following code, check text_lightrag.txt
with open("./configs/rag_hotpotqa.yaml", "r") as file:
with open("./use_cases/configs/rag_hotpotqa.yaml", "r") as file:
settings = yaml.safe_load(file)
print(settings)

# Load the dataset and select the first 5 as the showcase
# 300 M.
# More info about the HotpotQA dataset can be found at https://huggingface.co/datasets/hotpot_qa
dataset = load_dataset(path="hotpot_qa", name="fullwiki")
dataset = dataset["train"].select(range(5))
print(f"len of eval: {len(dataset['test'])}")
print(f"example: {dataset['test'][1]}")
# exit()
dataset = dataset["train"].select(range(1))

all_questions = []
all_retrieved_context = []
Expand Down Expand Up @@ -117,9 +121,9 @@ def get_supporting_sentences(
# Evaluate the generator using LLM as judge. We use GPT-4 as the judge here.
# The task description and the judgement query can be customized.
llm_evaluator = Generator(
model_client=OpenAIClient(),
prompt=Prompt(DEFAULT_LLM_EVALUATOR_PROMPT),
output_processors=Sequential(JsonParser()),
model_client=OpenAIClient,
template=DEFAULT_LLM_EVALUATOR_PROMPT,
output_processors=JsonParser(),
preset_prompt_kwargs={
"task_desc_str": r"""
You are a helpful assistant.
Expand Down

0 comments on commit 32f6b8e

Please sign in to comment.