From 1a6f349a4bc0510ce2f6246d1926876fbf5ff6da Mon Sep 17 00:00:00 2001
From: Li Yin
Date: Sun, 14 Jul 2024 12:03:55 -0700
Subject: [PATCH] add acall and address mypy errors for Sequential
---
.github/workflows/documentation.yml | 2 +-
docs/source/tutorials/generator.rst | 7 +--
docs/source/tutorials/retriever.rst | 66 +++++++++++++++--------------
lightrag/CHANGELOG.md | 3 ++
lightrag/lightrag/core/container.py | 58 +++++++++++++++++++------
5 files changed, 88 insertions(+), 48 deletions(-)
diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml
index aecce85b..fd2d6971 100644
--- a/.github/workflows/documentation.yml
+++ b/.github/workflows/documentation.yml
@@ -3,7 +3,7 @@ name: Documentation
on:
push:
branches:
- - li # Trigger the workflow when changes are pushed to the release branch
+ - release # Trigger the workflow when changes are pushed to the release branch
permissions:
contents: write
diff --git a/docs/source/tutorials/generator.rst b/docs/source/tutorials/generator.rst
index 60b6bdc9..369807b2 100644
--- a/docs/source/tutorials/generator.rst
+++ b/docs/source/tutorials/generator.rst
@@ -12,7 +12,7 @@ Generator
`Generator` is a user-facing orchestration component with a simple and unified interface for LLM prediction.
-It is a pipeline consisting of three subcomponents.
+It is a pipeline consisting of three subcomponents. By switching the prompt template, model client, and output parser, users have full control and flexibility.
Design
---------------------------------------
@@ -26,11 +26,10 @@ Design
-
The :class:`Generator` is designed to achieve the following goals:
1. Model Agnostic: The Generator should be able to call any LLM model with the same prompt.
-2. Unified Interface: It should manage the pipeline from prompt(input)->model call -> output parsing.
+2. Unified interface: It manages the pipeline from prompt (input) -> model call -> output parsing, while still giving users full control over each part.
3. Unified Output: This will make it easy to log and save records of all LLM predictions.
4. Work with Optimizer: It should be able to work with Optimizer to optimize the prompt.
@@ -443,6 +442,7 @@ Besides these examples, LLM is like water, even in our library, we have componen
- :class:`LLMRetriever` is a retriever that uses Generator to call LLM to retrieve the most relevant documents.
- :class:`DefaultLLMJudge` is a judge that uses Generator to call LLM to evaluate the quality of the response.
- :class:`LLMOptimizer` is an optimizer that uses Generator to call LLM to optimize the prompt.
+- :class:`ReActAgent` has an LLM planner that uses Generator to plan and to call functions in the ReAct agent.
Tracing
---------------------------------------
@@ -479,6 +479,7 @@ Coming soon!
- :class:`tracing.generator_call_logger.GeneratorCallLogger`
- :class:`tracing.generator_state_logger.GeneratorStateLogger`
- :class:`components.retriever.llm_retriever.LLMRetriever`
+ - :class:`components.agent.react.ReActAgent`
- :class:`eval.llm_as_judge.DefaultLLMJudge`
- :class:`optim.llm_optimizer.LLMOptimizer`
- :func:`utils.config.new_component`
diff --git a/docs/source/tutorials/retriever.rst b/docs/source/tutorials/retriever.rst
index d403f989..4b8d1bae 100644
--- a/docs/source/tutorials/retriever.rst
+++ b/docs/source/tutorials/retriever.rst
@@ -120,9 +120,14 @@ Working with ``DialogTurn`` can help manage ``conversation_history``, especiall
Retriever Data Types
-^^^^^^^^^^^^^^^^^^^^^^^^
-In most cases, the query is string. But there are cases we might need both text and images as a query, such as "find me a cloth that looks like this".
-We defined the query type as:
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+**Query**
+
+In most cases, the query is a string. But there are cases where we might need both text and images as a query, such as "find me a cloth that looks like this".
+We defined the query type `RetrieverQueriesType` so that all of our retrievers can handle both a single query and multiple queries at once.
+For text-based retrievers, we defined `RetrieverStrQueriesType` as a string or a sequence of strings.
+
.. code-block:: python
@@ -131,28 +136,29 @@ We defined the query type as:
RetrieverQueriesType = Union[RetrieverQueryType, Sequence[RetrieverQueryType]]
RetrieverStrQueriesType = Union[str, Sequence[RetrieverStrQueryType]]
-As we see, our retriever should be able to handle both single query and multiple queries at once.
+**Documents**
-The documents are a sequence of document of any type that will be later specified by the subclass:
+The documents are a sequence of documents of any type, which will be later specified by the subclass:
.. code-block:: python
RetrieverDocumentType = TypeVar("RetrieverDocumentType", contravariant=True) # a single document
RetrieverDocumentsType = Sequence[RetrieverDocumentType] # The final documents types retriever can use
+**Output**
-We further define the same output format so that we can easily switch between different retrievers in our task pipeline.
-Here is our output format:
+We further defined the unified output data structure :class:`RetrieverOutput` so that we can easily switch between different retrievers in our task pipeline.
+A retriever should return a list of `RetrieverOutput` to support multiple queries at once. This is helpful for:
+(1) Batch-processing: Especially for semantic search, where multiple queries can be represented as a numpy array and computed all at once, providing faster speeds than processing each query one by one.
+(2) Query expansion: To increase recall, users often generate multiple queries from the original query.
-.. code-block:: python
- class RetrieverOutput(DataClass):
- __doc__ = r"""Save the output of a single query in retrievers.
- It is up to the subclass of Retriever to specify the type of query and document.
- """
+.. code-block:: python
+ @dataclass
+ class RetrieverOutput(DataClass):
doc_indices: List[int] = field(metadata={"desc": "List of document indices"})
doc_scores: Optional[List[float]] = field(
default=None, metadata={"desc": "List of document scores"}
@@ -167,11 +173,24 @@ Here is our output format:
RetrieverOutputType = List[RetrieverOutput] # so to support multiple queries at once
-You can find the types in :ref:`types`. The list of queries and `RetrieverOutput` can be helpful for:
-(1) Batch-processing: especially for semantic search where multiple queries can be represented as numpy array and be computed all at once with faster speed than doing one by one.
-(2) For `query expansion` where to increase the recall, users often generate multiple queries from the original query.
+**Document and TextSplitter**
+
+If your documents (in text format) are too large, it is common practice to first use :class:`TextSplitter` to split the text into smaller chunks.
+Please refer to the :doc:`text_splitter` tutorial on how to use it.
+
+
+
+Retriever Base Class
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Functionally, the base retriever :class:`Retriever` defines another required method ``build_index_from_documents`` where the subclass will prepare the retriever for the actual retrieval calls.
+Optionally, the subclass can implement ``save_to_file`` and ``load_from_file`` to save and load the retriever to/from disk.
+As the retriever is a subclass of component, you already inherited powerful serialization and deserialization methods such as ``to_dict``, ``from_dict``, and ``from_config`` to help
+with the saving and loading process. As for helper attributes, we have ``indexed`` and ``index_keys`` to differentiate if the retriever is ready for retrieval and the attributes that are key to restore the functionality/states of the retriever.
+It is up to the subclass to decide how to store the index: it can be kept in memory, on local disk, or in cloud storage, and saved as a JSON or pickle file or even a database table.
+As an example, :class:`BM25Retriever` has the following key attributes to index.
.. code-block:: python
@@ -196,24 +215,9 @@ You can find the types in :ref:`types`. The list of queries and `Ret
raise NotImplementedError(f"Async retrieve is not implemented")
-**Document and TextSplitter**
-
-If your documents(text format) are too large and it is a common practise to first use ``TextSplitter`` to split them into smaller chunks.
-Please refer to :doc:`text_splitter` and our provided notebook on how to use it.
-
-
-
-Retriever Base Class
-^^^^^^^^^^^^^^^^^^^^^^^^
+.. code:: python
-Functionally, the base retriever :class:`Retriever` defines another required method ``build_index_from_documents`` where the subclass will prepare the retriever for the actual retrieval calls.
-Optionally, the subclass can implement ``save_to_file`` and ``load_from_file`` to save and load the retriever to/from disk.
-As the retriever is a subclass of component, you already inherited powerful serialization and deserialization methods such as ``to_dict``, ``from_dict``, and ``from_config`` to help
-with the saving and loading process. As for helper attributes, we have ``indexed`` and ``index_keys`` to differentiate if the retriever is ready for retrieval and the attributes that are key to restore the functionality/states of the retriever.
-It is up the subclass to decide how to decide the storage of the index, it can be in-memory, local disk, or cloud storage, or save as json or pickle file or even a db table.
-As an example, :class:`BM25Retriever` has the following key attributes to index.
-.. code:: python
self.index_keys = ["nd", "t2d", "idf","doc_len","avgdl","total_documents","top_k","k1","b","epsilon","indexed"]
diff --git a/lightrag/CHANGELOG.md b/lightrag/CHANGELOG.md
index fd876f53..00076909 100644
--- a/lightrag/CHANGELOG.md
+++ b/lightrag/CHANGELOG.md
@@ -1,3 +1,6 @@
+### Added
+- `Sequential` adds `acall` method.
+
## [0.0.0-beta.1] - 2024-07-10
### Added
diff --git a/lightrag/lightrag/core/container.py b/lightrag/lightrag/core/container.py
index 7170e1b6..6f7c14e5 100644
--- a/lightrag/lightrag/core/container.py
+++ b/lightrag/lightrag/core/container.py
@@ -13,7 +13,7 @@
class Sequential(Component):
__doc__ = r"""A sequential container.
- Follows the same design pattern as PyTorch's ``nn.Sequential``.
+ Adapted from PyTorch's ``nn.Sequential``.
Components will be added to it in the order they are passed to the constructor.
Alternatively, an ``OrderedDict`` of components can be passed in.
@@ -97,7 +97,7 @@ def call(self, input: int) -> int:
>>> result = seq.call(2, 3)
"""
- _components: Dict[str, Component] # = OrderedDict()
+ _components: Dict[str, Component] = OrderedDict() # type: ignore[assignment]
@overload
def __init__(self, *args: Component) -> None: ...
@@ -114,7 +114,7 @@ def __init__(self, *args):
for idx, component in enumerate(args):
self.add_component(str(idx), component)
- def _get_item_by_idx(self, iterator: Iterator[T], idx: int) -> T:
+ def _get_item_by_idx(self, iterator: Iterator[Component], idx: int) -> Component:
"""Get the idx-th item of the iterator."""
size = len(self)
idx = operator.index(idx)
@@ -132,15 +132,18 @@ def __getitem__(
elif isinstance(idx, str):
return self._components[idx]
else:
- return self._get_item_by_idx(self._components.values(), idx)
+ return self._get_item_by_idx(iter(self._components.values()), idx)
def __setitem__(self, idx: Union[int, str], component: Component) -> None:
"""Set the idx-th component of the Sequential."""
if isinstance(idx, str):
self._components[idx] = component
else:
- key: str = self._get_item_by_idx(self._components.keys(), idx)
- return setattr(self, key, component)
+ # key: str = self._get_item_by_idx(iter(self._components.keys()), idx)
+ # self._components[key] = component
+ key_list = list(self._components.keys())
+ key = key_list[idx]
+ self._components[key] = component
def __delitem__(self, idx: Union[slice, int, str]) -> None:
"""Delete the idx-th component of the Sequential."""
@@ -150,15 +153,18 @@ def __delitem__(self, idx: Union[slice, int, str]) -> None:
elif isinstance(idx, str):
del self._components[idx]
else:
- key = self._get_item_by_idx(self._components.keys(), idx)
+ # key = self._get_item_by_idx(iter(self._components.keys()), idx)
+ key_list = list(self._components.keys())
+ key = key_list[idx]
+
delattr(self, key)
- # To preserve numbering
- str_indices = [str(i) for i in range(len(self._components))]
+
+ # Reordering is needed if numerical keys are used to keep the sequence
self._components = OrderedDict(
- list(zip(str_indices, self._components.values()))
+ (str(i), comp) for i, comp in enumerate(self._components.values())
)
- def __iter__(self) -> Iterable[Component]:
+ def __iter__(self) -> Iterator[Component]:
r"""Iterates over the components of the Sequential.
Examples:
@@ -250,6 +256,33 @@ def call(self, *args: Any, **kwargs: Any) -> object:
kwargs = {}
return args[0] if len(args) == 1 else (args, kwargs)
+ @overload
+ async def acall(self, input: Any) -> object: ...
+
+ @overload
+ async def acall(self, *args: Any, **kwargs: Any) -> object: ...
+
+ async def acall(self, *args: Any, **kwargs: Any) -> object:
+ r"""When components contain for loops or multiple await calls, using the acall method can potentially speed up the execution."""
+ if len(args) == 1 and not kwargs:
+ input = args[0]
+ for component in self._components.values():
+ input = await component(input)
+ return input
+ else:
+ for component in self._components.values():
+ result = await component(*args, **kwargs)
+ if (
+ isinstance(result, tuple)
+ and len(result) == 2
+ and isinstance(result[1], dict)
+ ):
+ args, kwargs = result
+ else:
+ args = (result,)
+ kwargs = {}
+ return args[0] if len(args) == 1 else (args, kwargs)
+
def append(self, component: Component) -> "Sequential":
r"""Appends a component to the end of the Sequential."""
idx = len(self._components)
@@ -259,7 +292,7 @@ def append(self, component: Component) -> "Sequential":
def insert(self, idx: int, component: Component) -> None:
r"""Inserts a component at a given index in the Sequential."""
if not isinstance(component, Component):
- raise AssertionError(
+ raise TypeError(
f"component should be an instance of Component, but got {type(component)}"
)
n = len(self._components)
@@ -272,7 +305,6 @@ def insert(self, idx: int, component: Component) -> None:
for i in range(n, idx, -1):
self._components[str(i)] = self._components[str(i - 1)]
self._components[str(idx)] = component
- return self
def extend(self, components: Iterable[Component]) -> "Sequential":
r"""Extends the Sequential with components from an iterable."""