Skip to content

Commit

Permalink
adapt the rag to the new code
Browse files Browse the repository at this point in the history
  • Loading branch information
liyin2015 committed May 21, 2024
1 parent 8c3b6ae commit 32f6b8e
Show file tree
Hide file tree
Showing 7 changed files with 37 additions and 20 deletions.
3 changes: 3 additions & 0 deletions benchmarks/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Benchmarking is an integral development part of the project.

Contributors are encouraged to write benchmarks for their code, in addition to the unit tests in the `tests/` directory.
4 changes: 2 additions & 2 deletions core/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from core.component import Component
from core.data_classes import Document

from core.functional import generate_component_key, generate_readable_key_for_function
from core.functional import generate_readable_key_for_function

"""
Why do we need a localDocumentDB as the product db is always in the cloud?
Expand Down Expand Up @@ -73,7 +73,7 @@ def transform_data(
) -> List[Document]:
"""Transform the documents using the transformer, the transformed documents will be used to build index."""
if key is None:
key = generate_component_key(transformer)
key = transformer._get_name() + "_transformed"
documents_to_use = documents.copy() if documents else self.documents.copy()
self.transformed_documents[key] = transformer(documents_to_use)
return key
Expand Down
9 changes: 7 additions & 2 deletions docs/source/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ You have a similar coding experience as PyTorch. Here is a side to side comparis
We encourage all users to at least skim through the developer documentation. Different from "PyTorch", where a typical user does not have to customize a building module for a neural network,
LLM applications have a much bigger scope and vary even more across different product environments, so it is much more common for developers to customize components on their own.

Developer documentation
Dive deep into the design of the libraries
=======================

.. toctree::
Expand Down Expand Up @@ -156,6 +156,11 @@ Developer documentation
apis/prompts/prompts
apis/eval/eval

.. toctree::
:maxdepth: 1
:caption: Benchmarks
.. Manually add documents for the code in benchmarks


.. toctree::
:glob:
Expand All @@ -166,7 +171,7 @@ Developer documentation
resources/contributing


User documentation
Use the library
=======================

.. toctree::
Expand Down
2 changes: 1 addition & 1 deletion prompts/outputs.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""The most commonly used output parsers for the Generator.
Note: Even with OutputParser for output_format_str formatting and the response parsing, it is not 100% guaranteed
as user query can impact the output.
as user query can impact the output. Test your code well!
"""

from dataclasses import is_dataclass
Expand Down
13 changes: 7 additions & 6 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,20 +1,21 @@
[tool.poetry]
name = "lightrag"

packages = [
{ include = "core", from = "." },
{ include = "components", from = "." },
{ include = "prompts", from = "." },
{ include = "eval", from = "." },
]
version = "0.1.0"
description = "1000 lines of code is all you need. Be light and be lightening fast."
description = "The 'PyTorch' library for LLM applications. RAG=Retriever-Agent-Generator."
authors = ["Li Yin <[email protected]>"]
readme = "README.md"
license = "MIT"
classifiers = [
"Topic :: Software Development :: Build Tools",
"Topic :: Software Development :: Libraries :: Python Modules",
]
packages = [
{ include = "core", from = "." },
{ include = "components", from = "." },
# { include = "use_cases", from = "." },
]

[tool.poetry.dependencies]
python = ">=3.11, <4.0"
Expand Down
12 changes: 8 additions & 4 deletions use_cases/rag.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,11 @@
from core.component import Component, Sequential
from core.db import LocalDocumentDB

from core.functional import generate_component_key
import os

os.environ["KMP_DUPLICATE_LIB_OK"] = "True"

# from core.functional import generate_component_key

from components.api_client import OpenAIClient
from components.retriever import FAISSRetriever
Expand All @@ -36,7 +40,7 @@ def __init__(self, settings: dict):
self.text_splitter_settings = settings["text_splitter"]

vectorizer = Embedder(
model_client=OpenAIClient(),
model_client=OpenAIClient,
# batch_size=self.vectorizer_settings["batch_size"],
model_kwargs=self.vectorizer_settings["model_kwargs"],
output_processors=ToEmbedderResponse(),
Expand All @@ -54,7 +58,7 @@ def __init__(self, settings: dict):
batch_size=self.vectorizer_settings["batch_size"],
),
)
self.data_transformer_key = generate_component_key(self.data_transformer)
self.data_transformer_key = self.data_transformer._get_name()
# initialize retriever, which depends on the vectorizer too
self.retriever = FAISSRetriever(
top_k=self.retriever_settings["top_k"],
Expand Down Expand Up @@ -83,7 +87,7 @@ def __init__(self, settings: dict):
"answer": "The answer to the query",
}"""
},
model_client=OpenAIClient(),
model_client=OpenAIClient,
model_kwargs=self.generator_model_kwargs,
output_processors=JsonParser(),
)
Expand Down
14 changes: 9 additions & 5 deletions use_cases/rag_hotpotqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,14 +42,18 @@ def get_supporting_sentences(

if __name__ == "__main__":
    # NOTE: for the output of the following code, check text_lightrag.txt
with open("./configs/rag_hotpotqa.yaml", "r") as file:
with open("./use_cases/configs/rag_hotpotqa.yaml", "r") as file:
settings = yaml.safe_load(file)
print(settings)

# Load the dataset and select the first 5 as the showcase
# 300 M.
# More info about the HotpotQA dataset can be found at https://huggingface.co/datasets/hotpot_qa
dataset = load_dataset(path="hotpot_qa", name="fullwiki")
dataset = dataset["train"].select(range(5))
print(f"len of eval: {len(dataset['test'])}")
print(f"example: {dataset['test'][1]}")
# exit()
dataset = dataset["train"].select(range(1))

all_questions = []
all_retrieved_context = []
Expand Down Expand Up @@ -117,9 +121,9 @@ def get_supporting_sentences(
# Evaluate the generator using LLM as judge. We use GPT-4 as the judge here.
# The task description and the judgement query can be customized.
llm_evaluator = Generator(
model_client=OpenAIClient(),
prompt=Prompt(DEFAULT_LLM_EVALUATOR_PROMPT),
output_processors=Sequential(JsonParser()),
model_client=OpenAIClient,
template=DEFAULT_LLM_EVALUATOR_PROMPT,
output_processors=JsonParser(),
preset_prompt_kwargs={
"task_desc_str": r"""
You are a helpful assistant.
Expand Down

0 comments on commit 32f6b8e

Please sign in to comment.