diff --git a/.env_example b/.env_example index d20bbf73..a4a7aed0 100644 --- a/.env_example +++ b/.env_example @@ -1,2 +1,6 @@ OPENAI_API_KEY=YOUR_API_KEY_IF_YOU_USE_OPENAI GROQ_API_KEY=YOUR_API_KEY_IF_YOU_USE_GROQ +ANTHROPIC_API_KEY=YOUR_API_KEY_IF_YOU_USE_ANTHROPIC +GOOGLE_API_KEY=YOUR_API_KEY_IF_YOU_USE_GOOGLE +COHERE_API_KEY=YOUR_API_KEY_IF_YOU_USE_COHERE +HF_TOKEN=YOUR_API_KEY_IF_YOU_USE_HF diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml new file mode 100644 index 00000000..6d251f96 --- /dev/null +++ b/.github/workflows/documentation.yml @@ -0,0 +1,68 @@ +name: Documentation + +on: + push: + branches: + - xiaoyi_doc # Ensure this is the branch where you commit documentation updates + +permissions: + contents: write + actions: read + +jobs: + build-and-deploy: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Install Poetry + run: | + curl -sSL https://install.python-poetry.org | python3 - + echo "$HOME/.local/bin" >> $GITHUB_PATH + + - name: Install dependencies using Poetry + run: | + poetry config virtualenvs.create false + poetry install + + - name: Build documentation using Makefile + run: | + echo "Building documentation from: $(pwd)" + ls -l # Debug: List current directory contents + poetry run make -C docs html + working-directory: ${{ github.workspace }} + + - name: List built documentation + run: | + find ./build/ -type f + working-directory: ${{ github.workspace }}/docs + + - name: Create .nojekyll file + run: | + touch .nojekyll + working-directory: ${{ github.workspace }}/docs/build + + - name: Deploy to GitHub Pages + uses: peaceiris/actions-gh-pages@v3 + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + publish_branch: gh-pages + publish_dir: ./docs/build/ + user_name: github-actions[bot] + user_email: github-actions[bot]@users.noreply.github.com + + # - name: Debug Output + # run: | + # pwd # Print the current working directory + # ls -l # List files in the build directory + # cat ./source/conf.py # Show Sphinx config file for debugging + # working-directory: ${{ github.workspace }}/docs/build diff --git a/.github/workflows/documentation_li.yml b/.github/workflows/documentation_li.yml new file mode 100644 index 00000000..335a063c --- /dev/null +++ b/.github/workflows/documentation_li.yml @@ -0,0 +1,67 @@ +name: Documentation + +on: + push: + branches: + - li # Ensure this is the branch where you commit documentation updates + +permissions: + contents: write + actions: read + +jobs: + build-and-deploy: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Install Poetry + run: | + curl -sSL https://install.python-poetry.org | python3 - + echo "$HOME/.local/bin" >> $GITHUB_PATH + + - name: Install dependencies using Poetry + run: | + poetry config virtualenvs.create false + poetry install + + - name: Build documentation using Makefile + run: | + echo "Building documentation from: $(pwd)" + ls -l # Debug: List current directory contents + poetry run make -C docs html + working-directory: ${{ github.workspace }} + + - name: List built documentation + run: | + find ./build/ -type f + working-directory: ${{ github.workspace }}/docs + + - name: Create .nojekyll file + run: | + touch .nojekyll + working-directory: 
${{ github.workspace }}/docs/build + + - name: Deploy to GitHub Pages + uses: peaceiris/actions-gh-pages@v3 + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + publish_branch: gh-pages + publish_dir: ./docs/build/ + user_name: github-actions[bot] + user_email: github-actions[bot]@users.noreply.github.com + # - name: Debug Output + # run: | + # pwd # Print the current working directory + # ls -l # List files in the build directory + # cat ./source/conf.py # Show Sphinx config file for debugging + # working-directory: ${{ github.workspace }}/docs/build diff --git a/.gitignore b/.gitignore index 2790df5f..c4ccbaf9 100644 --- a/.gitignore +++ b/.gitignore @@ -38,3 +38,6 @@ traces/ *.log storage/ *.pkl +/*.png +/*.dot +/*.svg diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 89127885..691ad663 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -12,11 +12,16 @@ repos: rev: 24.4.2 hooks: - id: black - args: ["--line-length=88"] + args: ['--line-length=88'] - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.4.2 hooks: # Run the linter. - id: ruff - args: ["--fix", "--extend-ignore=E402"] + args: ['--fix', '--extend-ignore=E402'] + # - repo: https://github.com/pycqa/flake8 + # rev: 4.0.1 + # hooks: + # - id: flake8 + # args: ['--max-line-length=88'] diff --git a/README.md b/README.md new file mode 100644 index 00000000..8c93d482 --- /dev/null +++ b/README.md @@ -0,0 +1,103 @@ +# Introduction + +LightRAG is the `PyTorch` library for building large language model (LLM) applications. We help developers with both building and optimizing `Retriever`-`Agent`-`Generator` (RAG) pipelines. +It is light, modular, and robust. + +**PyTorch** + +```python +import torch +import torch.nn as nn + +class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + self.conv1 = nn.Conv2d(1, 32, 3, 1) + self.conv2 = nn.Conv2d(32, 64, 3, 1) + self.dropout1 = nn.Dropout2d(0.25) + self.dropout2 = nn.Dropout2d(0.5) + self.fc1 = nn.Linear(9216, 128) + self.fc2 = nn.Linear(128, 10) + + def forward(self, x): + x = self.conv1(x) + x = self.conv2(x) + x = self.dropout1(x) + x = self.dropout2(x) + x = self.fc1(x) + return self.fc2(x) +``` + +**LightRAG** + +```python + +from lightrag.core import Component, Generator +from lightrag.components.model_client import GroqAPIClient +from lightrag.utils import setup_env #noqa + +class SimpleQA(Component): + def __init__(self): + super().__init__() + template = r""" + You are a helpful assistant. + + User: {{input_str}} + You: + """ + self.generator = Generator( + model_client=GroqAPIClient(), + model_kwargs={"model": "llama3-8b-8192"}, + template=template, + ) + + def call(self, query): + return self.generator({"input_str": query}) + + async def acall(self, query): + return await self.generator.acall({"input_str": query}) +``` + +## Simplicity + +Developers who are building real-world Large Language Model (LLM) applications are the real heroes. +As a library, we provide them with the fundamental building blocks with 100% clarity and simplicity. + +* Two fundamental and powerful base classes: Component for the pipeline and DataClass for data interaction with LLMs. +* We end up with less than two levels of subclasses. Class Hierarchy Visualization. +* The result is a library with bare minimum abstraction, providing developers with maximum customizability. + +Similar to the PyTorch module, our Component provides excellent visualization of the pipeline structure. 
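+For instance, here is a minimal usage sketch (assuming the `SimpleQA` component defined above and a valid `GROQ_API_KEY` set in your environment, e.g. via `.env`):
+
+```python
+# Instantiate the pipeline defined above and print it.
+# Component's string representation renders the nested structure,
+# much like printing a PyTorch nn.Module.
+qa = SimpleQA()
+print(qa)
+```
+
+This would produce output similar to the following: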
+ +``` +SimpleQA( + (generator): Generator( + model_kwargs={'model': 'llama3-8b-8192'}, + (prompt): Prompt( + template: + You are a helpful assistant. + + User: {{input_str}} + You: + , prompt_variables: ['input_str'] + ) + (model_client): GroqAPIClient() + ) +) +``` + +## Controllability + +Our simplicity did not come from doing 'less'. +On the contrary, we have to do 'more' and go 'deeper' and 'wider' on any topic to offer developers maximum control and robustness. + +* LLMs are sensitive to the prompt. We allow developers full control over their prompts without relying on API features such as tools and JSON format with components like Prompt, OutputParser, FunctionTool, and ToolManager. +* Our goal is not to optimize for integration, but to provide a robust abstraction with representative examples. See this in ModelClient and Retriever. +* All integrations, such as different API SDKs, are formed as optional packages but all within the same library. You can easily switch to any models from different providers that we officially support. + +## Future of LLM Applications + +On top of the easiness to use, we in particular optimize the configurability of components for researchers to build their solutions and to benchmark existing solutions. +Like how PyTorch has united both researchers and production teams, it enables smooth transition from research to production. +With researchers building on LightRAG, production engineers can easily take over the method and test and iterate on their production data. +Researchers will want their code to be adapted into more products too. diff --git a/lightrag/__init__.py b/_lightrag/lightrag/__init__.py similarity index 100% rename from lightrag/__init__.py rename to _lightrag/lightrag/__init__.py diff --git a/benchmarks/ReAct_agent/fever/fever.py b/benchmarks/ReAct_agent/fever/fever.py index ca84d522..4564b5ee 100644 --- a/benchmarks/ReAct_agent/fever/fever.py +++ b/benchmarks/ReAct_agent/fever/fever.py @@ -9,26 +9,25 @@ import dotenv from components.api_client.openai_client import OpenAIClient from components.agent.react_agent import ReActAgent -from core.tool_helper import FunctionTool +from core.func_tool import FunctionTool from components.api_client import GroqAPIClient import time from benchmarks.ReAct_agent.utils.tools import search, lookup, normalize_answer from eval.evaluators import AnswerMacthEvaluator import logging import json -from typing import List, Union, Callable, Optional, Any, Dict -from core.tool_helper import FunctionTool, AsyncCallable +from typing import List, Optional, Any, Dict logger = logging.getLogger(__name__) -logging.basicConfig(filename='./logs/fever.log', level=logging.INFO) +logging.basicConfig(filename="./logs/fever.log", level=logging.INFO) # load evironment dotenv.load_dotenv(dotenv_path=".env", override=True) # Reference: paper's instruction prompt. (we use our default DEFAULT_REACT_AGENT_SYSTEM_PROMPT) -# instruction = """Solve a question answering task with interleaving Thought, Action, Observation steps. Thought can reason about the current situation, and Action can be three types: +# instruction = """Solve a question answering task with interleaving Thought, Action, Observation steps. Thought can reason about the current situation, and Action can be three types: # (1) Search[entity], which searches the exact entity on Wikipedia and returns the first paragraph if it exists. If not, it will return some similar entities to search. 
# (2) Lookup[keyword], which returns the next sentence containing keyword in the current passage. # (3) Finish[answer], which returns the answer and finishes the task. @@ -41,7 +40,7 @@ FEVER_REACT_AGENT_SYSTEM_PROMPT = r""" {# role/task description #} -You task is to determine if there is Observation that SUPPORTS or REFUTES a Claim, or if there is NOT ENOUGH INFORMATION. +You task is to determine if there is Observation that SUPPORTS or REFUTES a Claim, or if there is NOT ENOUGH INFORMATION. Your can only answer SUPPORTS, REFUTES or NOT ENOUGH INFORMATION, and nothing else. {# REACT instructions #} Each step you will read the previous Thought, Action, and Observation(execution result of the action)steps and then provide the next Thought and Action. @@ -127,7 +126,7 @@ Observation 3: The song peaked at number two on the Billboard Hot 100 in the United States, where it was certified Gold for 500,000 units shipped. Thought 4: It only says the song peaked at number two on the Billboard Hot 100, but not if it was in 2003. I am not sure if this claim is true or not. Action 4: finish("NOT ENOUGH INFO") - """ + """, ] @@ -142,21 +141,29 @@ def config_agent(model_kwargs: Dict, examples: Optional[List[str]] = []) -> ReAc Returns: ReActAgent: the configured agent """ - - preset_prompt_kwargs = {'examples': examples} if len(examples) else {} - + + preset_prompt_kwargs = {"examples": examples} if len(examples) else {} + # set up tools - tools = [FunctionTool.from_defaults(fn=search), FunctionTool.from_defaults(fn=lookup)] - model_client = OpenAIClient if 'gpt' in model_kwargs.get('model', '') else GroqAPIClient - + tools = [ + FunctionTool.from_defaults(fn=search), + FunctionTool.from_defaults(fn=lookup), + ] + model_client = ( + OpenAIClient if "gpt" in model_kwargs.get("model", "") else GroqAPIClient + ) + return ReActAgent( - tools=tools, max_steps=7, model_client=model_client, - model_kwargs=model_kwargs, preset_prompt_kwargs=preset_prompt_kwargs, - template=FEVER_REACT_AGENT_SYSTEM_PROMPT + tools=tools, + max_steps=7, + model_client=model_client, + model_kwargs=model_kwargs, + preset_prompt_kwargs=preset_prompt_kwargs, + template=FEVER_REACT_AGENT_SYSTEM_PROMPT, ) -def run_query(agent: ReActAgent, question: str, gt_answer:str) -> Dict[str, float]: +def run_query(agent: ReActAgent, question: str, gt_answer: str) -> Dict[str, float]: """ Run queries and calculate the evaluation metrics """ @@ -164,19 +171,27 @@ def run_query(agent: ReActAgent, question: str, gt_answer:str) -> Dict[str, floa pred_answer = agent(question) pred_answer = normalize_answer(pred_answer) elapsed_time = time.time() - start_time - - logger.info(f"Question: {question}, \ngt_answer: {gt_answer}, \npred_answer: {pred_answer}\n") - - em = EM_evaluator.compute_match_acc_single_query(pred_answer=pred_answer, gt_answer=gt_answer) - fm = FM_evaluator.compute_match_acc_single_query(pred_answer=pred_answer, gt_answer=gt_answer) - - return { - "EM": em, - "FM": fm, - "time": elapsed_time - } - -def experiment(num_questions: int, dataset: List[Dict[str, Any]], model_kwargs: Dict, examples: Optional[List[str]] = []) -> Dict[str, float]: + + logger.info( + f"Question: {question}, \ngt_answer: {gt_answer}, \npred_answer: {pred_answer}\n" + ) + + em = EM_evaluator.compute_match_acc_single_query( + pred_answer=pred_answer, gt_answer=gt_answer + ) + fm = FM_evaluator.compute_match_acc_single_query( + pred_answer=pred_answer, gt_answer=gt_answer + ) + + return {"EM": em, "FM": fm, "time": elapsed_time} + + +def experiment( + num_questions: 
int, + dataset: List[Dict[str, Any]], + model_kwargs: Dict, + examples: Optional[List[str]] = [], +) -> Dict[str, float]: """ Perform react agent experiment, evaluation metrics are Exact Match and Fuzzy Match @@ -189,19 +204,19 @@ def experiment(num_questions: int, dataset: List[Dict[str, Any]], model_kwargs: Returns: Dict[str, float]: return the evaluations """ - + logger.info(f"model_kwargs: {model_kwargs}") - + # Initialize the agent once if configuration does not need to change each iteration react_agent = config_agent(model_kwargs=model_kwargs, examples=examples) total_metrics = {"N": 0, "EM": 0, "FM": 0, "time": 0} for i in range(num_questions): question = dataset[i]["claim"] gt_answer = normalize_answer(dataset[i]["label"]) - + result = run_query(react_agent, question, gt_answer) - total_metrics["N"] += 1 # number of questions + total_metrics["N"] += 1 # number of questions total_metrics["EM"] += result["EM"] total_metrics["FM"] += result["FM"] total_metrics["time"] += result["time"] @@ -213,54 +228,60 @@ def experiment(num_questions: int, dataset: List[Dict[str, Any]], model_kwargs: return average_metrics - # setup evaluators EM_evaluator = AnswerMacthEvaluator(type="exact_match") FM_evaluator = AnswerMacthEvaluator(type="fuzzy_match") # load test data -file = open('./tests/benchmark/ReAct_agent/paper_data/paper_dev_10.json') +file = open("./tests/benchmark/ReAct_agent/paper_data/paper_dev_10.json") dataset = json.load(file) # define the arguments, follow the paper's argument settings gpt_3_turbo_model_kwargs = { - "model": "gpt-3.5-turbo", - "temperature": 0.0, - "max_tokens": 100, - "top_p": 1, - "frequency_penalty":0.0, - "presence_penalty":0.0, - } + "model": "gpt-3.5-turbo", + "temperature": 0.0, + "max_tokens": 100, + "top_p": 1, + "frequency_penalty": 0.0, + "presence_penalty": 0.0, +} gpt_4o_model_kwargs = { - "model": "gpt-4o", - "temperature": 0.0, - "max_tokens": 100, - "top_p": 1, - "frequency_penalty":0.0, - "presence_penalty":0.0, - } + "model": "gpt-4o", + "temperature": 0.0, + "max_tokens": 100, + "top_p": 1, + "frequency_penalty": 0.0, + "presence_penalty": 0.0, +} gpt_4_turbo_model_kwargs = { - "model": "gpt-4-turbo-preview", - "temperature": 0.0, - "max_tokens": 100, - "top_p": 1, - "frequency_penalty":0.0, - "presence_penalty":0.0, - } + "model": "gpt-4-turbo-preview", + "temperature": 0.0, + "max_tokens": 100, + "top_p": 1, + "frequency_penalty": 0.0, + "presence_penalty": 0.0, +} llama3_model_kwargs = { - "model": "llama3-70b-8192", # llama3 is not good with string formatting, llama3 8b is also bad at following instruction, 70b is better but still not as good as gpt-3.5-turbo - "temperature": 0.0, - } + "model": "llama3-70b-8192", # llama3 is not good with string formatting, llama3 8b is also bad at following instruction, 70b is better but still not as good as gpt-3.5-turbo + "temperature": 0.0, +} num_questions = 10 # gpt_3_5_zero_shot = experiment(num_questions=num_questions, dataset=dataset, model_kwargs=gpt_3_turbo_model_kwargs) # gpt_3_5_3_shot = experiment(num_questions=num_questions, dataset=dataset, model_kwargs=gpt_3_turbo_model_kwargs, examples=examples) # gpt_3_5_6_shot = experiment(num_questions=num_questions, dataset=dataset, model_kwargs=gpt_3_turbo_model_kwargs, examples=examples) -gpt_4o_zero_shot = experiment(num_questions=num_questions, dataset=dataset, model_kwargs=gpt_4o_model_kwargs) -gpt_4o_3_shot = experiment(num_questions=num_questions, dataset=dataset, model_kwargs=gpt_4o_model_kwargs, examples=examples) +gpt_4o_zero_shot = experiment( 
+ num_questions=num_questions, dataset=dataset, model_kwargs=gpt_4o_model_kwargs +) +gpt_4o_3_shot = experiment( + num_questions=num_questions, + dataset=dataset, + model_kwargs=gpt_4o_model_kwargs, + examples=examples, +) # gpt_4_turbo_zero_shot = experiment(num_questions=num_questions, dataset=dataset, model_kwargs=gpt_4_turbo_model_kwargs) # gpt_4_turbo_3_shot = experiment(num_questions=num_questions, dataset=dataset, model_kwargs=gpt_4_turbo_model_kwargs, examples=examples) @@ -268,17 +289,16 @@ def experiment(num_questions: int, dataset: List[Dict[str, Any]], model_kwargs: # llama3_zero_shot = experiment(num_questions=num_questions, dataset=dataset, model_kwargs=llama3_model_kwargs) # llama3_3_shot = experiment(num_questions=num_questions, dataset=dataset, model_kwargs=llama3_model_kwargs, examples=examples) -# print(f"gpt_3_5_zero_shot: {gpt_3_5_zero_shot}") -# print(f"gpt_3_5_3_shot: {gpt_3_5_3_shot}") -# print(f"gpt_3_5_6_shot: {gpt_3_5_6_shot}") +# print(f"gpt_3_5_zero_shot: {gpt_3_5_zero_shot}") +# print(f"gpt_3_5_3_shot: {gpt_3_5_3_shot}") +# print(f"gpt_3_5_6_shot: {gpt_3_5_6_shot}") print(f"gpt_4o_zero_shot: {gpt_4o_zero_shot}") print(f"gpt_4o_3_shot: {gpt_4o_3_shot}") # print(f"gpt_4_turbo_zero_shot: {gpt_4_turbo_zero_shot}") # print(f"gpt_4_turbo_3_shot: {gpt_4_turbo_3_shot}") -# print(f"llama3_zero_shot: {llama3_zero_shot}") -# print(f"llama3_3_shot: {llama3_3_shot}") +# print(f"llama3_zero_shot: {llama3_zero_shot}") +# print(f"llama3_3_shot: {llama3_3_shot}") - """ NOTE: llama3 time might not accurate because it has request limit error @@ -306,4 +326,4 @@ def experiment(num_questions: int, dataset: List[Dict[str, Any]], model_kwargs: gpt_4_turbo_zero_shot: 0 10 0.0 36.0502730846405 gpt_4_turbo_3_shot: 8 10 0.8 7.756631708145141 -""" \ No newline at end of file +""" diff --git a/benchmarks/ReAct_agent/hotpotQA/hotpotqa.py b/benchmarks/ReAct_agent/hotpotQA/hotpotqa.py index 5e949e5a..ba224ea5 100644 --- a/benchmarks/ReAct_agent/hotpotQA/hotpotqa.py +++ b/benchmarks/ReAct_agent/hotpotQA/hotpotqa.py @@ -7,28 +7,27 @@ """ import dotenv -from components.api_client.openai_client import OpenAIClient +from components.model_client.openai_client import OpenAIClient from components.agent.react_agent import ReActAgent -from core.tool_helper import FunctionTool -from components.api_client import GroqAPIClient +from core.func_tool import FunctionTool +from components.model_client import GroqAPIClient import time from benchmarks.ReAct_agent.utils.tools import search, lookup, normalize_answer from eval.evaluators import AnswerMacthEvaluator import logging import json -from typing import List, Union, Callable, Optional, Any, Dict -from core.tool_helper import FunctionTool, AsyncCallable +from typing import List, Optional, Any, Dict logger = logging.getLogger(__name__) -logging.basicConfig(filename='./logs/hotpot.log', level=logging.INFO) +logging.basicConfig(filename="./logs/hotpot.log", level=logging.INFO) # load evironment dotenv.load_dotenv(dotenv_path=".env", override=True) # Reference: paper's instruction prompt. (we use our default DEFAULT_REACT_AGENT_SYSTEM_PROMPT) -# instruction = """Solve a question answering task with interleaving Thought, Action, Observation steps. Thought can reason about the current situation, and Action can be three types: +# instruction = """Solve a question answering task with interleaving Thought, Action, Observation steps. 
Thought can reason about the current situation, and Action can be three types: # (1) Search[entity], which searches the exact entity on Wikipedia and returns the first paragraph if it exists. If not, it will return some similar entities to search. # (2) Lookup[keyword], which returns the next sentence containing keyword in the current passage. # (3) Finish[answer], which returns the answer and finishes the task. @@ -39,7 +38,7 @@ # setup examples for few-shot experiment # 6 examples from the paper's source code(transformed the format to use in LightRAG) examples = [ -"""Question: What is the elevation range for the area that the eastern sector of the Colorado orogeny extends into? + """Question: What is the elevation range for the area that the eastern sector of the Colorado orogeny extends into? Thought 1: I need to search Colorado orogeny, find the area that the eastern sector of the Colorado orogeny extends into, then find the elevation range of the area. Action 1: search("Colorado orogeny") Observation 1: The Colorado orogeny was an episode of mountain building (an orogeny) in Colorado and surrounding areas. @@ -54,16 +53,16 @@ Observation 4: The High Plains are a subregion of the Great Plains. From east to west, the High Plains rise in elevation from around 1,800 to 7,000 ft (550 to 2,130 m).[3] Thought 5: High Plains rise in elevation from around 1,800 to 7,000 ft, so the answer is 1,800 to 7,000 ft. Action 5: finish("1,800 to 7,000 ft")""", -"""Question: Musician and satirist Allie Goertz wrote a song about the "The Simpsons" character Milhouse, who Matt Groening named after who? + """Question: Musician and satirist Allie Goertz wrote a song about the "The Simpsons" character Milhouse, who Matt Groening named after who? Thought 1: The question simplifies to "The Simpsons" character Milhouse is named after who. I only need to search Milhouse and find who it is named after. Action 1: search("Milhouse") Observation 1: Milhouse Mussolini Van Houten is a recurring character in the Fox animated television series The Simpsons voiced by Pamela Hayden and created by Matt Groening. Thought 2: The paragraph does not tell who Milhouse is named after, maybe I can look up "named after". Action 2: lookup("named after") -Observation 2: (Result 1 / 1) Milhouse was named after U.S. president Richard Nixon, whose middle name was Milhous. +Observation 2: (Result 1 / 1) Milhouse was named after U.S. president Richard Nixon, whose middle name was Milhous. Thought 3: Milhouse was named after U.S. president Richard Nixon, so the answer is Richard Nixon. Action 3: finish("Richard Nixon")""", -"""Question: Which documentary is about Finnish rock groups, Adam Clayton Powell or The Saimaa Gesture? + """Question: Which documentary is about Finnish rock groups, Adam Clayton Powell or The Saimaa Gesture? Thought 1: I need to search Adam Clayton Powell and The Saimaa Gesture, and find which documentary is about Finnish rock groups. Action 1: search("Adam Clayton Powell") Observation 1: Could not find ["Adam Clayton Powell"]. Similar: ['Adam Clayton Powell III', 'Seventh Avenue (Manhattan)', 'Adam Clayton Powell Jr. State Office Building', 'Isabel Washington Powell', 'Adam Powell', 'Adam Clayton Powell (film)', 'Giancarlo Esposito']. @@ -73,7 +72,7 @@ The film is about the rise and fall of influential African-American politician Adam Clayton Powell Jr.[3][4] It was later aired as part of the PBS series The American Experience. 
Thought 3: Adam Clayton Powell (film) is a documentary about an African-American politician, not Finnish rock groups. So the documentary about Finnish rock groups must instead be The Saimaa Gesture. Action 3: finish("The Saimaa Gesture")""", -"""Question: What profession does Nicholas Ray and Elia Kazan have in common? + """Question: What profession does Nicholas Ray and Elia Kazan have in common? Thought 1: I need to search Nicholas Ray and Elia Kazan, find their professions, then find the profession they have in common. Action 1: search("Nicholas Ray") Observation 1: Nicholas Ray (born Raymond Nicholas Kienzle Jr., August 7, 1911 – June 16, 1979) was an American film director, screenwriter, and actor best known for the 1955 film Rebel Without a Cause. @@ -82,24 +81,24 @@ Observation 2: Elia Kazan was an American film and theatre director, producer, screenwriter and actor. Thought 3: Professions of Elia Kazan are director, producer, screenwriter, and actor. So profession Nicholas Ray and Elia Kazan have in common is director, screenwriter, and actor. Action 3: finish("director, screenwriter, actor")""", -"""Question: Which magazine was started first Arthur's Magazine or First for Women? + """Question: Which magazine was started first Arthur's Magazine or First for Women? Thought 1: I need to search Arthur's Magazine and First for Women, and find which was started first. Action 1: search("Arthur's Magazine") -Observation 1: Arthur's Magazine (1844-€“1846) was an American literary periodical published in Philadelphia in the 19th century. +Observation 1: Arthur's Magazine (1844-€“1846) was an American literary periodical published in Philadelphia in the 19th century. Thought 2: Arthur's Magazine was started in 1844. I need to search First for Women next. Action 2: search("First for Women") -Observation 2: First for Women is a woman's magazine published by Bauer Media Group in the USA.[1] The magazine was started in 1989. +Observation 2: First for Women is a woman's magazine published by Bauer Media Group in the USA.[1] The magazine was started in 1989. Thought 3: First for Women was started in 1989. 1844 (Arthur's Magazine) < 1989 (First for Women), so Arthur's Magazine was started first. Action 3: finish("Arthur's Magazine")""", -"""Question: Were Pavel Urysohn and Leonid Levin known for the same type of work? + """Question: Were Pavel Urysohn and Leonid Levin known for the same type of work? Thought 1: I need to search Pavel Urysohn and Leonid Levin, find their types of work, then find if they are the same. Action 1: search("Pavel Urysohn") Observation 1: Pavel Samuilovich Urysohn (February 3, 1898 – August 17, 1924) was a Soviet mathematician who is best known for his contributions in dimension theory. Thought 2: Pavel Urysohn is a mathematician. I need to search Leonid Levin next and find its type of work. Action 2: search("Leonid Levin") -Observation 2: Leonid Anatolievich Levin is a Soviet-American mathematician and computer scientist. -Thought 3: Leonid Levin is a mathematician and computer scientist. So Pavel Urysohn and Leonid Levin have the same type of work. -Action 3: finish("yes")""" +Observation 2: Leonid Anatolievich Levin is a Soviet-American mathematician and computer scientist. +Thought 3: Leonid Levin is a mathematician and computer scientist. So Pavel Urysohn and Leonid Levin have the same type of work. 
+Action 3: finish("yes")""", ] @@ -114,20 +113,28 @@ def config_agent(model_kwargs: Dict, examples: Optional[List[str]] = []) -> ReAc Returns: ReActAgent: the configured agent """ - - preset_prompt_kwargs = {'examples': examples} if len(examples) else {} - + + preset_prompt_kwargs = {"examples": examples} if len(examples) else {} + # set up tools - tools = [FunctionTool.from_defaults(fn=search), FunctionTool.from_defaults(fn=lookup)] - model_client = OpenAIClient if 'gpt' in model_kwargs.get('model', '') else GroqAPIClient - + tools = [ + FunctionTool.from_defaults(fn=search), + FunctionTool.from_defaults(fn=lookup), + ] + model_client = ( + OpenAIClient if "gpt" in model_kwargs.get("model", "") else GroqAPIClient + ) + return ReActAgent( - tools=tools, max_steps=7, model_client=model_client, - model_kwargs=model_kwargs, preset_prompt_kwargs=preset_prompt_kwargs + tools=tools, + max_steps=7, + model_client=model_client, + model_kwargs=model_kwargs, + preset_prompt_kwargs=preset_prompt_kwargs, ) -def run_query(agent: ReActAgent, question: str, gt_answer:str) -> Dict[str, float]: +def run_query(agent: ReActAgent, question: str, gt_answer: str) -> Dict[str, float]: """ Run queries and calculate the evaluation metrics """ @@ -135,19 +142,27 @@ def run_query(agent: ReActAgent, question: str, gt_answer:str) -> Dict[str, floa pred_answer = agent(question) pred_answer = normalize_answer(pred_answer) elapsed_time = time.time() - start_time - - logger.info(f"Question: {question}, \ngt_answer: {gt_answer}, \npred_answer: {pred_answer}\n") - - em = EM_evaluator.compute_match_acc_single_query(pred_answer=pred_answer, gt_answer=gt_answer) - fm = FM_evaluator.compute_match_acc_single_query(pred_answer=pred_answer, gt_answer=gt_answer) - - return { - "EM": em, - "FM": fm, - "time": elapsed_time - } - -def experiment(num_questions: int, dataset: List[Dict[str, Any]], model_kwargs: Dict, examples: Optional[List[str]] = []) -> Dict[str, float]: + + logger.info( + f"Question: {question}, \ngt_answer: {gt_answer}, \npred_answer: {pred_answer}\n" + ) + + em = EM_evaluator.compute_match_acc_single_query( + pred_answer=pred_answer, gt_answer=gt_answer + ) + fm = FM_evaluator.compute_match_acc_single_query( + pred_answer=pred_answer, gt_answer=gt_answer + ) + + return {"EM": em, "FM": fm, "time": elapsed_time} + + +def experiment( + num_questions: int, + dataset: List[Dict[str, Any]], + model_kwargs: Dict, + examples: Optional[List[str]] = [], +) -> Dict[str, float]: """ Perform react agent experiment, evaluation metrics are Exact Match and Fuzzy Match @@ -160,19 +175,19 @@ def experiment(num_questions: int, dataset: List[Dict[str, Any]], model_kwargs: Returns: Dict[str, float]: return the evaluations """ - + logger.info(f"model_kwargs: {model_kwargs}") - + # Initialize the agent once if configuration does not need to change each iteration react_agent = config_agent(model_kwargs=model_kwargs, examples=examples) total_metrics = {"N": 0, "EM": 0, "FM": 0, "time": 0} for i in range(num_questions): question = dataset[i]["question"] gt_answer = normalize_answer(dataset[i]["answer"]) - + result = run_query(react_agent, question, gt_answer) - total_metrics["N"] += 1 # number of questions + total_metrics["N"] += 1 # number of questions total_metrics["EM"] += result["EM"] total_metrics["FM"] += result["FM"] total_metrics["time"] += result["time"] @@ -184,46 +199,47 @@ def experiment(num_questions: int, dataset: List[Dict[str, Any]], model_kwargs: return average_metrics - # setup evaluators EM_evaluator = 
AnswerMacthEvaluator(type="exact_match") FM_evaluator = AnswerMacthEvaluator(type="fuzzy_match") # load test data -file = open('./tests/benchmark/ReAct_agent/paper_data/hotpot_dev_v1_simplified_random_100.json') +file = open( + "./tests/benchmark/ReAct_agent/paper_data/hotpot_dev_v1_simplified_random_100.json" +) dataset = json.load(file) # define the arguments, follow the paper's argument settings gpt_model_kwargs = { - "model": "gpt-3.5-turbo", - "temperature": 0.0, - "max_tokens": 100, - "top_p": 1, - "frequency_penalty":0.0, - "presence_penalty":0.0, - } + "model": "gpt-3.5-turbo", + "temperature": 0.0, + "max_tokens": 100, + "top_p": 1, + "frequency_penalty": 0.0, + "presence_penalty": 0.0, +} gpt_4o_model_kwargs = { - "model": "gpt-4o", - "temperature": 0.0, - "max_tokens": 100, - "top_p": 1, - "frequency_penalty":0.0, - "presence_penalty":0.0, - } + "model": "gpt-4o", + "temperature": 0.0, + "max_tokens": 100, + "top_p": 1, + "frequency_penalty": 0.0, + "presence_penalty": 0.0, +} gpt_4_turbo_model_kwargs = { - "model": "gpt-4-turbo-preview", - "temperature": 0.0, - "max_tokens": 100, - "top_p": 1, - "frequency_penalty":0.0, - "presence_penalty":0.0, - } + "model": "gpt-4-turbo-preview", + "temperature": 0.0, + "max_tokens": 100, + "top_p": 1, + "frequency_penalty": 0.0, + "presence_penalty": 0.0, +} llama3_model_kwargs = { - "model": "llama3-70b-8192", # llama3 is not good with string formatting, llama3 8b is also bad at following instruction, 70b is better but still not as good as gpt-3.5-turbo - "temperature": 0.0, - } + "model": "llama3-70b-8192", # llama3 is not good with string formatting, llama3 8b is also bad at following instruction, 70b is better but still not as good as gpt-3.5-turbo + "temperature": 0.0, +} num_questions = 5 # gpt_3_5_zero_shot = experiment(num_questions=num_questions, dataset=dataset, model_kwargs=gpt_model_kwargs) @@ -234,22 +250,26 @@ def experiment(num_questions: int, dataset: List[Dict[str, Any]], model_kwargs: # gpt_4o_6_shot = experiment(num_questions=num_questions, dataset=dataset, model_kwargs=gpt_4o_model_kwargs, examples=examples) # gpt_4_turbo_zero_shot = experiment(num_questions=num_questions, dataset=dataset, model_kwargs=gpt_4_turbo_model_kwargs) -gpt_4_turbo_6_shot = experiment(num_questions=num_questions, dataset=dataset, model_kwargs=gpt_4_turbo_model_kwargs, examples=examples) +gpt_4_turbo_6_shot = experiment( + num_questions=num_questions, + dataset=dataset, + model_kwargs=gpt_4_turbo_model_kwargs, + examples=examples, +) # llama3_zero_shot = experiment(num_questions=num_questions, dataset=dataset, model_kwargs=llama3_model_kwargs) # llama3_6_shot = experiment(num_questions=num_questions, dataset=dataset, model_kwargs=llama3_model_kwargs, examples=examples) -# print(f"gpt_3_5_zero_shot: {gpt_3_5_zero_shot}") -# print(f"gpt_3_5_3_shot: {gpt_3_5_3_shot}") -# print(f"gpt_3_5_6_shot: {gpt_3_5_6_shot}") +# print(f"gpt_3_5_zero_shot: {gpt_3_5_zero_shot}") +# print(f"gpt_3_5_3_shot: {gpt_3_5_3_shot}") +# print(f"gpt_3_5_6_shot: {gpt_3_5_6_shot}") # print(f"gpt_4o_zero_shot: {gpt_4o_zero_shot}") # print(f"gpt_4o_6_shot: {gpt_4o_6_shot}") # print(f"gpt_4_turbo_zero_shot: {gpt_4_turbo_zero_shot}") print(f"gpt_4_turbo_6_shot: {gpt_4_turbo_6_shot}") -# print(f"llama3_zero_shot: {llama3_zero_shot}") -# print(f"llama3_6_shot: {llama3_6_shot}") +# print(f"llama3_zero_shot: {llama3_zero_shot}") +# print(f"llama3_6_shot: {llama3_6_shot}") - """ NOTE: llama3 time might not accurate because it has request limit error @@ -276,4 +296,4 @@ def 
experiment(num_questions: int, dataset: List[Dict[str, Any]], model_kwargs: randomly selected 100 records gpt_3_5_zero_shot: {'EM': 0.02, 'FM': 0.23, 'time': 16.584252796173097, 'Average step': 5.93} gpt_3_5_6_shot: {'EM': 0.02, 'FM': 0.09, 'time': 10.081220099925995, 'Average step': 6.78} -""" \ No newline at end of file +""" diff --git a/class_hierarchy_edges.csv b/class_hierarchy_edges.csv new file mode 100644 index 00000000..a9348645 --- /dev/null +++ b/class_hierarchy_edges.csv @@ -0,0 +1,68 @@ +Component,ListParser +Component,JsonParser +Component,YamlParser +Component,ToolManager +Component,Prompt +Component,ModelClient +Component,Retriever +Component,FunctionTool +Component,Tokenizer +Component,Generator +Component,Embedder +Component,BatchEmbedder +Component,Sequential +Component,FunComponent +Component,ReActAgent +Component,OutputParser +Component,TextSplitter +Component,DocumentSplitter +Component,ToEmbeddings +Component,RetrieverOutputToContextStr +Component,DefaultLLMJudge +Component,LLMAugmenter +Generic,LocalDB +Generic,Retriever +Generic,GeneratorOutput +Generic,Parameter +Generic,Sample +Generic,Sampler +Generic,RandomSampler +Generic,ClassSampler +ModelClient,CohereAPIClient +ModelClient,TransformersClient +ModelClient,GroqAPIClient +ModelClient,GoogleGenAIClient +ModelClient,OpenAIClient +ModelClient,AnthropicAPIClient +Retriever,BM25Retriever +Retriever,PostgresRetriever +Retriever,RerankerRetriever +Retriever,LLMRetriever +Retriever,FAISSRetriever +Enum,DataClassFormatType +Enum,ModelType +Enum,DistanceToOperator +Enum,OptionalPackages +DataClass,EmbedderOutput +DataClass,GeneratorOutput +DataClass,RetrieverOutput +DataClass,FunctionDefinition +DataClass,Function +DataClass,FunctionExpression +DataClass,FunctionOutput +DataClass,StepOutput +DataClass,Document +DataClass,DialogTurn +DataClass,Instruction +DataClass,GeneratorStatesRecord +DataClass,GeneratorCallRecord +Generator,CoTGenerator +Generator,CoTGeneratorWithJsonOutput +OutputParser,YamlOutputParser +OutputParser,JsonOutputParser +OutputParser,ListOutputParser +OutputParser,BooleanOutputParser +Optimizer,BootstrapFewShot +Optimizer,LLMOptimizer +Sampler,RandomSampler +Sampler,ClassSampler diff --git a/developer_notes/__init__.py b/developer_notes/__init__.py index e69de29b..d33bab7c 100644 --- a/developer_notes/__init__.py +++ b/developer_notes/__init__.py @@ -0,0 +1,3 @@ +from lightrag.utils import setup_env + +setup_env() diff --git a/lightrag/components/__init__.py b/developer_notes/agent.py similarity index 100% rename from lightrag/components/__init__.py rename to developer_notes/agent.py diff --git a/developer_notes/dataclass.ipynb b/developer_notes/dataclass.ipynb new file mode 100644 index 00000000..39625374 --- /dev/null +++ b/developer_notes/dataclass.ipynb @@ -0,0 +1,550 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from dataclasses import dataclass, field\n", + "\n", + "@dataclass\n", + "class Question:\n", + " question: str = field(\n", + " metadata={\"desc\": \"The question asked by the user\"}\n", + " )\n", + " metadata: dict = field(\n", + " metadata={\"desc\": \"The metadata of the question\"}, default_factory=dict\n", + " )\n", + "\n", + "@dataclass\n", + "class TrecData:\n", + " question: Question = field(\n", + " metadata={\"desc\": \"The question asked by the user\"}\n", + " ) # Required field, you have to provide the question field at the instantiation\n", + " label: int = field(\n", + " metadata={\"desc\": \"The 
label of the question\"}, default=0\n", + " ) # Optional field" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "TrecData(question=Question(question='What is the capital of France?', metadata={}), label=1)\n", + "{'question': {'question': 'What is the capital of France?', 'metadata': {}}, 'label': 1}\n", + "TrecData(question={'question': 'What is the capital of France?', 'metadata': {}}, label=1)\n", + "False\n" + ] + } + ], + "source": [ + "# dataclass itself is powerful, but it can not reconstruct nested dataclass\n", + "example = TrecData(Question(\"What is the capital of France?\"), 1)\n", + "print(example)\n", + "\n", + "from dataclasses import asdict\n", + "print(asdict(example))\n", + "reconstructed = TrecData(**asdict(example))\n", + "print(reconstructed)\n", + "print(reconstructed == example)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "ename": "TypeError", + "evalue": "non-default argument 'metadata' follows default argument", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[4], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# it does not allow required field after optional field\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;129;43m@dataclass\u001b[39;49m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;43;01mclass\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;21;43;01mTrecData2\u001b[39;49;00m\u001b[43m:\u001b[49m\n\u001b[1;32m 4\u001b[0m \u001b[43m \u001b[49m\u001b[43mquestion\u001b[49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mQuestion\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mfield\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 5\u001b[0m \u001b[43m \u001b[49m\u001b[43mmetadata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m{\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mdesc\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mThe question asked by the user\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m}\u001b[49m\n\u001b[1;32m 6\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Required field, you have to provide the question field at the instantiation\u001b[39;49;00m\n\u001b[1;32m 7\u001b[0m \u001b[43m \u001b[49m\u001b[43mlabel\u001b[49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mint\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mfield\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 8\u001b[0m \u001b[43m \u001b[49m\u001b[43mmetadata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m{\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mdesc\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mThe label of the question\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdefault\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0\u001b[39;49m\n\u001b[1;32m 9\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# 
Optional field\u001b[39;49;00m\n", + "File \u001b[0;32m/opt/homebrew/Cellar/python@3.11/3.11.6_1/Frameworks/Python.framework/Versions/3.11/lib/python3.11/dataclasses.py:1230\u001b[0m, in \u001b[0;36mdataclass\u001b[0;34m(cls, init, repr, eq, order, unsafe_hash, frozen, match_args, kw_only, slots, weakref_slot)\u001b[0m\n\u001b[1;32m 1227\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m wrap\n\u001b[1;32m 1229\u001b[0m \u001b[38;5;66;03m# We're called as @dataclass without parens.\u001b[39;00m\n\u001b[0;32m-> 1230\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mwrap\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mcls\u001b[39;49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/homebrew/Cellar/python@3.11/3.11.6_1/Frameworks/Python.framework/Versions/3.11/lib/python3.11/dataclasses.py:1220\u001b[0m, in \u001b[0;36mdataclass..wrap\u001b[0;34m(cls)\u001b[0m\n\u001b[1;32m 1219\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mwrap\u001b[39m(\u001b[38;5;28mcls\u001b[39m):\n\u001b[0;32m-> 1220\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_process_class\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mcls\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minit\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mrepr\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43meq\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43morder\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43munsafe_hash\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1221\u001b[0m \u001b[43m \u001b[49m\u001b[43mfrozen\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmatch_args\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkw_only\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mslots\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1222\u001b[0m \u001b[43m \u001b[49m\u001b[43mweakref_slot\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/homebrew/Cellar/python@3.11/3.11.6_1/Frameworks/Python.framework/Versions/3.11/lib/python3.11/dataclasses.py:1027\u001b[0m, in \u001b[0;36m_process_class\u001b[0;34m(cls, init, repr, eq, order, unsafe_hash, frozen, match_args, kw_only, slots, weakref_slot)\u001b[0m\n\u001b[1;32m 1022\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m init:\n\u001b[1;32m 1023\u001b[0m \u001b[38;5;66;03m# Does this class have a post-init function?\u001b[39;00m\n\u001b[1;32m 1024\u001b[0m has_post_init \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mhasattr\u001b[39m(\u001b[38;5;28mcls\u001b[39m, _POST_INIT_NAME)\n\u001b[1;32m 1026\u001b[0m _set_new_attribute(\u001b[38;5;28mcls\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m__init__\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[0;32m-> 1027\u001b[0m \u001b[43m_init_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mall_init_fields\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1028\u001b[0m \u001b[43m \u001b[49m\u001b[43mstd_init_fields\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1029\u001b[0m \u001b[43m \u001b[49m\u001b[43mkw_only_init_fields\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1030\u001b[0m \u001b[43m \u001b[49m\u001b[43mfrozen\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1031\u001b[0m \u001b[43m \u001b[49m\u001b[43mhas_post_init\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1032\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# The name to use for the \"self\"\u001b[39;49;00m\n\u001b[1;32m 1033\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# param in __init__. 
Use \"self\"\u001b[39;49;00m\n\u001b[1;32m 1034\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# if possible.\u001b[39;49;00m\n\u001b[1;32m 1035\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m__dataclass_self__\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mself\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mfields\u001b[49m\n\u001b[1;32m 1036\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mself\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1037\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mglobals\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1038\u001b[0m \u001b[43m \u001b[49m\u001b[43mslots\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1039\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m)\n\u001b[1;32m 1041\u001b[0m \u001b[38;5;66;03m# Get the fields as a list, and include only real fields. This is\u001b[39;00m\n\u001b[1;32m 1042\u001b[0m \u001b[38;5;66;03m# used in all of the following methods.\u001b[39;00m\n\u001b[1;32m 1043\u001b[0m field_list \u001b[38;5;241m=\u001b[39m [f \u001b[38;5;28;01mfor\u001b[39;00m f \u001b[38;5;129;01min\u001b[39;00m fields\u001b[38;5;241m.\u001b[39mvalues() \u001b[38;5;28;01mif\u001b[39;00m f\u001b[38;5;241m.\u001b[39m_field_type \u001b[38;5;129;01mis\u001b[39;00m _FIELD]\n", + "File \u001b[0;32m/opt/homebrew/Cellar/python@3.11/3.11.6_1/Frameworks/Python.framework/Versions/3.11/lib/python3.11/dataclasses.py:545\u001b[0m, in \u001b[0;36m_init_fn\u001b[0;34m(fields, std_fields, kw_only_fields, frozen, has_post_init, self_name, globals, slots)\u001b[0m\n\u001b[1;32m 543\u001b[0m seen_default \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 544\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m seen_default:\n\u001b[0;32m--> 545\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mnon-default argument \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mf\u001b[38;5;241m.\u001b[39mname\u001b[38;5;132;01m!r}\u001b[39;00m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m 546\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mfollows default argument\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 548\u001b[0m \u001b[38;5;28mlocals\u001b[39m \u001b[38;5;241m=\u001b[39m {\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m_type_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mf\u001b[38;5;241m.\u001b[39mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m: f\u001b[38;5;241m.\u001b[39mtype \u001b[38;5;28;01mfor\u001b[39;00m f \u001b[38;5;129;01min\u001b[39;00m fields}\n\u001b[1;32m 549\u001b[0m \u001b[38;5;28mlocals\u001b[39m\u001b[38;5;241m.\u001b[39mupdate({\n\u001b[1;32m 550\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mMISSING\u001b[39m\u001b[38;5;124m'\u001b[39m: MISSING,\n\u001b[1;32m 551\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m_HAS_DEFAULT_FACTORY\u001b[39m\u001b[38;5;124m'\u001b[39m: _HAS_DEFAULT_FACTORY,\n\u001b[1;32m 552\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m__dataclass_builtins_object__\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;28mobject\u001b[39m,\n\u001b[1;32m 
553\u001b[0m })\n", + "\u001b[0;31mTypeError\u001b[0m: non-default argument 'metadata' follows default argument" + ] + } + ], + "source": [ + "# it does not allow required field after optional field\n", + "@dataclass\n", + "class TrecData2:\n", + " question: Question = field(\n", + " metadata={\"desc\": \"The question asked by the user\"}\n", + " ) # Required field, you have to provide the question field at the instantiation\n", + " label: int = field(\n", + " metadata={\"desc\": \"The label of the question\"}, default=0\n", + " ) # Optional field\n", + " metadata: dict = field(\n", + " metadata={\"desc\": \"The metadata of the question\"}\n", + " ) # required field" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# lets see what DataClass can do \n", + "# 1. allow required field after optional field using required_field on default_factory\n", + "\n", + "from lightrag.core import DataClass, required_field\n", + "\n", + "@dataclass\n", + "class TrecData2(DataClass):\n", + " question: Question = field(\n", + " metadata={\"desc\": \"The question asked by the user\"}\n", + " ) # Required field, you have to provide the question field at the instantiation\n", + " label: int = field(\n", + " metadata={\"desc\": \"The label of the question\"}, default=0\n", + " ) # Optional field\n", + " metadata: dict = field(\n", + " metadata={\"desc\": \"The metadata of the question\"}, default_factory=required_field()\n", + " ) # required field" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "TrecData2(question=Question(question='What is the capital of France?', metadata={}), label=1, metadata={'key': 'value'})\n", + "{'question': {'question': 'What is the capital of France?', 'metadata': {}}, 'label': 1, 'metadata': {'key': 'value'}}\n", + "{'question': {'question': 'What is the capital of France?', 'metadata': {}}, 'label': 1, 'metadata': {'key': 'value'}}\n", + "TrecData2(question=Question(question='What is the capital of France?', metadata={}), label=1, metadata={'key': 'value'})\n", + "True\n", + "True\n" + ] + } + ], + "source": [ + "# 2. 
allow you to reconstructed nested dataclass\n", + "# You just have to make sure the class you are handling are subclass of DataClass, the child class can be native dataclass\n", + "\n", + "example = TrecData2(Question(\"What is the capital of France?\"), 1, {\"key\": \"value\"})\n", + "print(example)\n", + "\n", + "dict_example = TrecData2.to_dict(example) # use as if its a class method\n", + "print(dict_example)\n", + "\n", + "dict_example_2 = example.to_dict() # use it as instance method\n", + "print(dict_example)\n", + "\n", + "reconstructed = TrecData2.from_dict(dict_example)\n", + "print(reconstructed)\n", + "\n", + "print(reconstructed == example)\n", + "print(dict_example == dict_example_2)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'question': {'question': 'What is the capital of France?', 'metadata': {}}, 'label': 1}\n", + "{'question': {'question': 'What is the capital of France?'}, 'label': 1}\n" + ] + } + ], + "source": [ + "# Lets exclude fields too\n", + "\n", + "# Note: better not to exclude required fields, as it will run issues using from_dict\n", + "# you can use it if you dont mind to reconstruct\n", + "\n", + "# exclude field of only the parent class\n", + "dict_exclude = example.to_dict(exclude=[\"metadata\"])\n", + "print(dict_exclude)\n", + "\n", + "# exclude field of the parent and child class\n", + "dict_exclude = example.to_dict(exclude={\"TrecData2\": [\"metadata\"], \"Question\": [\"metadata\"]})\n", + "print(dict_exclude)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"question\": {\n", + " \"question\": \"What is the capital of France?\",\n", + " \"metadata\": {}\n", + " },\n", + " \"label\": 1,\n", + " \"metadata\": {\n", + " \"key\": \"value\"\n", + " }\n", + "}\n", + "question:\n", + " question: What is the capital of France?\n", + " metadata: {}\n", + "label: 1\n", + "metadata:\n", + " key: value\n", + "\n", + "TrecData2(question=Question(question='What is the capital of France?', metadata={}), label=1, metadata={'key': 'value'})\n", + "True\n", + "TrecData2(question=Question(question='What is the capital of France?', metadata={}), label=1, metadata={'key': 'value'})\n", + "True\n" + ] + } + ], + "source": [ + "# lets do the yaml and json string for demonstraing the data example\n", + "\n", + "json_str = example.to_json()\n", + "print(json_str)\n", + "\n", + "yaml_str = example.to_yaml()\n", + "print(yaml_str)\n", + "\n", + "reconstructed_from_json = TrecData2.from_json(json_str)\n", + "print(reconstructed_from_json)\n", + "print(reconstructed_from_json == example)\n", + "\n", + "reconstructed_from_yaml = TrecData2.from_yaml(yaml_str)\n", + "print(reconstructed_from_yaml)\n", + "print(reconstructed_from_yaml == example)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"question\": {\n", + " \"question\": \"What is the capital of France?\",\n", + " \"metadata\": {}\n", + " },\n", + " \"label\": 1,\n", + " \"metadata\": {\n", + " \"key\": \"value\"\n", + " }\n", + "}\n", + "question:\n", + " question: What is the capital of France?\n", + " metadata: {}\n", + "label: 1\n", + "metadata:\n", + " key: value\n", + "\n" + ] + } + ], + "source": [ + "# use with DataClassFormatType and 
format_example_str\n", + "\n", + "from lightrag.core import DataClassFormatType\n", + "\n", + "example_str = example.format_example_str(DataClassFormatType.EXAMPLE_JSON)\n", + "print(example_str)\n", + "\n", + "example_str = example.format_example_str(DataClassFormatType.EXAMPLE_YAML)\n", + "print(example_str)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'type': 'TrecData2',\n", + " 'properties': {'question': {'type': \"{'type': 'Question', 'properties': {'question': {'type': 'str', 'desc': 'The question asked by the user'}, 'metadata': {'type': 'dict', 'desc': 'The metadata of the question'}}, 'required': ['question']}\",\n", + " 'desc': 'The question asked by the user'},\n", + " 'label': {'type': 'int', 'desc': 'The label of the question'},\n", + " 'metadata': {'type': 'dict', 'desc': 'The metadata of the question'}},\n", + " 'required': ['question', 'metadata']}" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Now, lets check the data format using class method without instance\n", + "# schema, you can choose to only use properties \n", + "\n", + "schema = TrecData2.to_schema()\n", + "schema" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'type': 'TrecData2',\n", + " 'properties': {'question': {'type': \"{'type': 'Question', 'properties': {'question': {'type': 'str', 'desc': 'The question asked by the user'}}, 'required': ['question']}\",\n", + " 'desc': 'The question asked by the user'},\n", + " 'label': {'type': 'int', 'desc': 'The label of the question'}},\n", + " 'required': ['question']}" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# schema with exclude\n", + "schema_exclude = TrecData2.to_schema(exclude={\"TrecData2\": [\"metadata\"], \"Question\": [\"metadata\"]})\n", + "schema_exclude" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"question\": \"The question asked by the user ({'type': 'Question', 'properties': {'question': {'type': 'str', 'desc': 'The question asked by the user'}, 'metadata': {'type': 'dict', 'desc': 'The metadata of the question'}}, 'required': ['question']}) (required)\",\n", + " \"label\": \"The label of the question (int) (optional)\",\n", + " \"metadata\": \"The metadata of the question (dict) (required)\"\n", + "}\n" + ] + } + ], + "source": [ + "# signature, json_signature\n", + "\n", + "json_signature = TrecData2.to_json_signature()\n", + "print(json_signature)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"question\": \"The question asked by the user ({'type': 'Question', 'properties': {'question': {'type': 'str', 'desc': 'The question asked by the user'}}, 'required': ['question']}) (required)\",\n", + " \"label\": \"The label of the question (int) (optional)\"\n", + "}\n" + ] + } + ], + "source": [ + "# exclude field of the parent and child class\n", + "\n", + "json_signature_exclude = TrecData2.to_json_signature(exclude={\"TrecData2\": [\"metadata\"], \"Question\": [\"metadata\"]})\n", + "print(json_signature_exclude)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": 
{}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"question\": \"The question asked by the user ({'type': 'Question', 'properties': {'question': {'type': 'str', 'desc': 'The question asked by the user'}, 'metadata': {'type': 'dict', 'desc': 'The metadata of the question'}}, 'required': ['question']}) (required)\",\n", + " \"label\": \"The label of the question (int) (optional)\"\n", + "}\n" + ] + } + ], + "source": [ + "# only exclude the parent class\n", + "\n", + "json_signature_exclude = TrecData2.to_json_signature(exclude=[\"metadata\"])\n", + "print(json_signature_exclude)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "question: The question asked by the user ({'type': 'Question', 'properties': {'question': {'type': 'str', 'desc': 'The question asked by the user'}, 'metadata': {'type': 'dict', 'desc': 'The metadata of the question'}}, 'required': ['question']}) (required)\n", + "label: The label of the question (int) (optional)\n", + "metadata: The metadata of the question (dict) (required)\n" + ] + } + ], + "source": [ + "# signature, yaml_signature\n", + "\n", + "yaml_signature = TrecData2.to_yaml_signature()\n", + "print(yaml_signature)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"question\": \"The question asked by the user ({'type': 'Question', 'properties': {'question': {'type': 'str', 'desc': 'The question asked by the user'}, 'metadata': {'type': 'dict', 'desc': 'The metadata of the question'}}, 'required': ['question']}) (required)\",\n", + " \"label\": \"The label of the question (int) (optional)\",\n", + " \"metadata\": \"The metadata of the question (dict) (required)\"\n", + "}\n", + "question: The question asked by the user ({'type': 'Question', 'properties': {'question': {'type': 'str', 'desc': 'The question asked by the user'}, 'metadata': {'type': 'dict', 'desc': 'The metadata of the question'}}, 'required': ['question']}) (required)\n", + "label: The label of the question (int) (optional)\n", + "metadata: The metadata of the question (dict) (required)\n", + "{\n", + " \"type\": \"TrecData2\",\n", + " \"properties\": {\n", + " \"question\": {\n", + " \"type\": \"{'type': 'Question', 'properties': {'question': {'type': 'str', 'desc': 'The question asked by the user'}, 'metadata': {'type': 'dict', 'desc': 'The metadata of the question'}}, 'required': ['question']}\",\n", + " \"desc\": \"The question asked by the user\"\n", + " },\n", + " \"label\": {\n", + " \"type\": \"int\",\n", + " \"desc\": \"The label of the question\"\n", + " },\n", + " \"metadata\": {\n", + " \"type\": \"dict\",\n", + " \"desc\": \"The metadata of the question\"\n", + " }\n", + " },\n", + " \"required\": [\n", + " \"question\",\n", + " \"metadata\"\n", + " ]\n", + "}\n" + ] + } + ], + "source": [ + "# use the DataClassFormatType to control it \n", + "\n", + "from lightrag.core import DataClassFormatType\n", + "\n", + "json_signature = TrecData2.format_class_str(DataClassFormatType.SIGNATURE_JSON)\n", + "print(json_signature)\n", + "\n", + "yaml_signature = TrecData2.format_class_str(DataClassFormatType.SIGNATURE_YAML)\n", + "print(yaml_signature)\n", + "\n", + "schema = TrecData2.format_class_str(DataClassFormatType.SCHEMA)\n", + "print(schema)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + 
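
The `exclude` argument accepts either a per-class mapping, which drops fields from the parent and the nested `Question` class, or a flat list, which only touches the top-level class. Below is a minimal, self-contained sketch of that pattern; the class bodies are illustrative stand-ins for the `Question`/`TrecData2` definitions earlier in this notebook, so field order and defaults here are assumptions.

```python
# Illustrative stand-ins for the Question / TrecData2 classes defined earlier
# in this notebook; only the exclude behaviour is the point here.
from dataclasses import dataclass, field

from lightrag.core import DataClass


@dataclass
class Question(DataClass):
    question: str = field(metadata={"desc": "The question asked by the user"})
    metadata: dict = field(
        metadata={"desc": "The metadata of the question"}, default_factory=dict
    )


@dataclass
class TrecData2(DataClass):
    question: Question = field(metadata={"desc": "The question asked by the user"})
    metadata: dict = field(
        metadata={"desc": "The metadata of the question"}, default_factory=dict
    )
    label: int = field(metadata={"desc": "The label of the question"}, default=0)


# Per-class exclude: drops `metadata` from both the parent and the nested child.
print(
    TrecData2.to_json_signature(
        exclude={"TrecData2": ["metadata"], "Question": ["metadata"]}
    )
)

# Flat-list exclude: only the top-level TrecData2 fields are affected.
print(TrecData2.to_json_signature(exclude=["metadata"]))
```
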
"metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "OutputFormat(thought=None, class_name='Entity', class_index=1)\n" + ] + } + ], + "source": [ + "# load with customizd from dict\n", + "from typing import Dict\n", + "@dataclass\n", + "class OutputFormat(DataClass):\n", + " thought: str = field(\n", + " metadata={\n", + " \"desc\": \"Your reasoning to classify the question to class_name\",\n", + " }\n", + " )\n", + " class_name: str = field(metadata={\"desc\": \"class_name\"})\n", + " class_index: int = field(metadata={\"desc\": \"class_index in range[0, 5]\"})\n", + "\n", + " @classmethod\n", + " def from_dict(cls, data: Dict[str, object]):\n", + " _COARSE_LABELS_DESC = [\n", + " \"Abbreviation\",\n", + " \"Entity\",\n", + " \"Description and abstract concept\",\n", + " \"Human being\",\n", + " \"Location\",\n", + " \"Numeric value\",\n", + " ]\n", + " data = {\n", + " \"thought\": None,\n", + " \"class_index\": data[\"coarse_label\"],\n", + " \"class_name\": _COARSE_LABELS_DESC[data[\"coarse_label\"]],\n", + " }\n", + " return super().from_dict(data)\n", + "\n", + "data = OutputFormat.from_dict({\"coarse_label\": 1})\n", + "print(data)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "lightrag-project", + "language": "python", + "name": "light-rag-project" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/developer_notes/generator.ipynb b/developer_notes/generator.ipynb index 67eb62a6..ff804b22 100644 --- a/developer_notes/generator.ipynb +++ b/developer_notes/generator.ipynb @@ -74,10 +74,48 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "GeneratorOutput(data='LightRAG is a light-based Real-time Anomaly Generator, which is a special type of anomaly detection system. It uses a combination of visual and statistical techniques to detect unusual patterns or outliers in a dataset in real-time, often for purposes such as identifying security threats, detecting fraud, or monitoring system performance. Would you like to know more about its applications or how it works?', error=None, usage=None, raw_response='LightRAG is a light-based Real-time Anomaly Generator, which is a special type of anomaly detection system. It uses a combination of visual and statistical techniques to detect unusual patterns or outliers in a dataset in real-time, often for purposes such as identifying security threats, detecting fraud, or monitoring system performance. 
Would you like to know more about its applications or how it works?')\n" + ] + } + ], + "source": [ + "from lightrag.core import Component, Generator, Prompt\n", + "from lightrag.components.model_client import GroqAPIClient\n", + "from lightrag.utils import setup_env # noqa\n", + "\n", + "\n", + "class SimpleQA(Component):\n", + " def __init__(self):\n", + " super().__init__()\n", + " template = r\"\"\"\n", + " You are a helpful assistant.\n", + " \n", + " User: {{input_str}}\n", + " You:\n", + " \"\"\"\n", + " self.generator = Generator(\n", + " model_client=GroqAPIClient(), model_kwargs={\"model\": \"llama3-8b-8192\"}, template=template\n", + " )\n", + "\n", + " def call(self, query):\n", + " return self.generator({\"input_str\": query})\n", + "\n", + " async def acall(self, query):\n", + " return await self.generator.acall({\"input_str\": query})\n", + "\n", + "\n", + "qa = SimpleQA()\n", + "answer = qa(\"What is LightRAG?\")\n", + "\n", + "print(answer)" + ] } ], "metadata": { diff --git a/developer_notes/generator_note.py b/developer_notes/generator_note.py new file mode 100644 index 00000000..2fa89251 --- /dev/null +++ b/developer_notes/generator_note.py @@ -0,0 +1,181 @@ +from lightrag.core import Component, Generator +from lightrag.components.model_client import GroqAPIClient + + +class SimpleQA(Component): + def __init__(self): + super().__init__() + template = r""" + You are a helpful assistant. + + User: {{input_str}} + You: + """ + self.generator = Generator( + model_client=GroqAPIClient(), + model_kwargs={"model": "llama3-8b-8192"}, + template=template, + ) + + def call(self, query): + return self.generator({"input_str": query}) + + async def acall(self, query): + return await self.generator.acall({"input_str": query}) + + +def minimum_generator(): + from lightrag.core import Generator + from lightrag.components.model_client import GroqAPIClient + + generator = Generator( + model_client=GroqAPIClient(), + model_kwargs={"model": "llama3-8b-8192"}, + ) + print(generator) + prompt_kwargs = {"input_str": "What is LLM? Explain in one sentence."} + generator.print_prompt(**prompt_kwargs) + output = generator( + prompt_kwargs=prompt_kwargs, + ) + print(output) + + +def use_a_json_parser(): + from lightrag.core import Generator + from lightrag.core.types import GeneratorOutput + from lightrag.components.model_client import OpenAIClient + from lightrag.core.string_parser import JsonParser + + output_format_str = """Your output should be formatted as a standard JSON object with two keys: + { + "explaination": "A brief explaination of the concept in one sentence.", + "example": "An example of the concept in a sentence." 
+ } + """ + + generator = Generator( + model_client=OpenAIClient(), + model_kwargs={"model": "gpt-3.5-turbo"}, + prompt_kwargs={"output_format_str": output_format_str}, + output_processors=JsonParser(), + ) + + prompt_kwargs = {"input_str": "What is LLM?"} + generator.print_prompt(**prompt_kwargs) + + output: GeneratorOutput = generator(prompt_kwargs=prompt_kwargs) + print(output) + print(type(output.data)) + print(output.data) + + +def use_its_own_template(): + from lightrag.core import Generator + from lightrag.components.model_client import GroqAPIClient + + template = r"""{{task_desc_str}} + User: {{input_str}} + You:""" + generator = Generator( + model_client=GroqAPIClient(), + model_kwargs={"model": "llama3-8b-8192"}, + template=template, + prompt_kwargs={"task_desc_str": "You are a helpful assistant"}, + ) + + prompt_kwargs = {"input_str": "What is LLM?"} + + generator.print_prompt( + **prompt_kwargs, + ) + output = generator( + prompt_kwargs=prompt_kwargs, + ) + print(output) + + +def use_model_client_enum_to_switch_client(): + from lightrag.core import Generator + from lightrag.core.types import ModelClientType + + generator = Generator( + model_client=ModelClientType.OPENAI(), # or ModelClientType.GROQ() + model_kwargs={"model": "gpt-3.5-turbo"}, + ) + print(generator) + prompt_kwargs = {"input_str": "What is LLM? Explain in one sentence."} + generator.print_prompt(**prompt_kwargs) + output = generator( + prompt_kwargs=prompt_kwargs, + ) + print(output) + + +def create_purely_from_config(): + + from lightrag.utils.config import new_component + from lightrag.core import Generator + + config = { + "generator": { + "component_name": "Generator", + "component_config": { + "model_client": { + "component_name": "GroqAPIClient", + "component_config": {}, + }, + "model_kwargs": { + "model": "llama3-8b-8192", + }, + }, + } + } + + generator: Generator = new_component(config["generator"]) + print(generator) + + prompt_kwargs = {"input_str": "What is LLM? Explain in one sentence."} + generator.print_prompt(**prompt_kwargs) + output = generator( + prompt_kwargs=prompt_kwargs, + ) + print(output) + + +def create_purely_from_config_2(): + + from lightrag.core import Generator + + config = { + "model_client": { + "component_name": "GroqAPIClient", + "component_config": {}, + }, + "model_kwargs": { + "model": "llama3-8b-8192", + }, + } + + generator: Generator = Generator.from_config(config) + print(generator) + + prompt_kwargs = {"input_str": "What is LLM? 
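
Each call above hands back a `GeneratorOutput` whose `data`, `error`, and `raw_response` fields can be inspected after the fact. A minimal sketch of consuming it defensively, assuming a Groq key is loaded via `setup_env()` and using the same `llama3-8b-8192` model as the other examples:

```python
# Minimal sketch: check GeneratorOutput.error before trusting .data.
# Assumes GROQ_API_KEY is available in the environment (loaded by setup_env).
from lightrag.core import Generator
from lightrag.core.types import GeneratorOutput
from lightrag.components.model_client import GroqAPIClient
from lightrag.utils import setup_env  # noqa

generator = Generator(
    model_client=GroqAPIClient(),
    model_kwargs={"model": "llama3-8b-8192"},
)

output: GeneratorOutput = generator(
    prompt_kwargs={"input_str": "What is LLM? Explain in one sentence."}
)

if output.error is not None:
    # The call or output parsing failed; raw_response may still hold the model text.
    print(f"generation failed: {output.error}")
    print(f"raw response: {output.raw_response}")
else:
    print(output.data)
```
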
Explain in one sentence."} + generator.print_prompt(**prompt_kwargs) + output = generator( + prompt_kwargs=prompt_kwargs, + ) + print(output) + + +if __name__ == "__main__": + qa = SimpleQA() + answer = qa("What is LightRAG?") + print(qa) + + minimum_generator() + use_a_json_parser() + use_its_own_template() + use_model_client_enum_to_switch_client() + create_purely_from_config() + create_purely_from_config_2() diff --git a/developer_notes/prompt_note.py b/developer_notes/prompt_note.py new file mode 100644 index 00000000..cb4bb167 --- /dev/null +++ b/developer_notes/prompt_note.py @@ -0,0 +1,84 @@ +def python_str_format_example(task_desc_str: str, input_str: str): + + # percent(%) formatting + print("%s User: %s" % (task_desc_str, input_str)) + + # format() method with kwargs + print( + "{task_desc_str} User: {input_str}".format( + task_desc_str=task_desc_str, input_str=input_str + ) + ) + + # f-string + print(f"{task_desc_str} User: {input_str}") + + # Templates + from string import Template + + t = Template("$task_desc_str User: $input_str") + print(t.substitute(task_desc_str=task_desc_str, input_str=input_str)) + + +def jinja2_template_example(template, **kwargs): + from jinja2 import Template + + t = Template(template, trim_blocks=True, lstrip_blocks=True) + print(t.render(**kwargs)) + + +def lightrag_prompt(template, task_desc_str, input_str, tools=None): + from lightrag.core.prompt_builder import Prompt + + prompt = Prompt( + template=template, + prompt_kwargs={ + "task_desc_str": task_desc_str, + "tools": tools, + }, + ) + print(prompt) + print(prompt(input_str=input_str)) + + saved_prompt = prompt.to_dict() + restored_prompt = Prompt.from_dict(saved_prompt) + print( + restored_prompt == prompt + ) # False as the jinja2 template can not be serialized, but we recreated the template from the string at the time of restoration, so it works the same + print(restored_prompt) + + +def lightrag_default_prompt(): + from lightrag.core.prompt_builder import Prompt + + prompt = Prompt() + input_str = "What is the capital of France?" + output = prompt(input_str=input_str) + print(output) + + +if __name__ == "__main__": + + task_desc_str = "You are a helpful assitant" + input_str = "What is the capital of France?" + tools = ["google", "wikipedia", "wikidata"] + template = r"""{{ task_desc_str }} +{# tools #} +{% if tools %} + +{% for tool in tools %} +{{loop.index}}. 
{{ tool }} +{% endfor %} + +{% endif %} +User: {{ input_str }}""" + python_str_format_example(task_desc_str, input_str) + jinja2_template_example(template, task_desc_str=task_desc_str, input_str=input_str) + jinja2_template_example( + template, task_desc_str=task_desc_str, input_str=input_str, tools=tools + ) + lightrag_prompt( + template, task_desc_str=task_desc_str, input_str=input_str, tools=tools + ) + + lightrag_default_prompt() diff --git a/developer_notes/retriever.ipynb b/developer_notes/retriever.ipynb index b5d3da32..f0b49aac 100644 --- a/developer_notes/retriever.ipynb +++ b/developer_notes/retriever.ipynb @@ -225,11 +225,11 @@ } ], "source": [ - "from lightrag.components.retriever import InMemoryBM25Retriever\n", + "from lightrag.components.retriever import BM25Retriever\n", "\n", "document_map_func = lambda x: x[\"content\"]\n", "\n", - "bm25_retriever = InMemoryBM25Retriever(top_k=2, documents=documents, document_map_func=document_map_func)\n", + "bm25_retriever = BM25Retriever(top_k=2, documents=documents, document_map_func=document_map_func)\n", "print(bm25_retriever)" ] }, @@ -2581,11 +2581,11 @@ } ], "source": [ - "from lightrag.components.retriever import InMemoryBM25Retriever\n", + "from lightrag.components.retriever import BM25Retriever\n", "\n", "index_strings = [doc.text for doc in documents]\n", "\n", - "retriever = InMemoryBM25Retriever(documents=index_strings)\n", + "retriever = BM25Retriever(documents=index_strings)\n", "\n", "# retriever.build_index_from_documents(documents=index_strings)\n", "\n", @@ -2612,7 +2612,7 @@ } ], "source": [ - "retriever = InMemoryBM25Retriever(top_k=1)\n", + "retriever = BM25Retriever(top_k=1)\n", "retriever.build_index_from_documents([\"hello world\", \"world is beautiful\", \"today is a good day\"])\n", "output = retriever.retrieve(\"hello\")\n", "output" @@ -2636,7 +2636,7 @@ "metadata": {}, "outputs": [], "source": [ - "retriever_loaded = InMemoryBM25Retriever.load_from_file(path)" + "retriever_loaded = BM25Retriever.load_from_file(path)" ] }, { diff --git a/developer_notes/tools.ipynb b/developer_notes/tools.ipynb new file mode 100644 index 00000000..3b128ccd --- /dev/null +++ b/developer_notes/tools.ipynb @@ -0,0 +1,2976 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ChatCompletion(id='chatcmpl-9epZmBcCQkmy3kvxBBp8aFoaMOROJ', choices=[Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content=None, role='assistant', function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_3LeBAkryCJcDQ6ZQKkKJUJgr', function=Function(arguments='{\"location\": \"San Francisco, CA\"}', name='get_current_weather'), type='function'), ChatCompletionMessageToolCall(id='call_cZublzUU6nLPiY2sa8M6vg0L', function=Function(arguments='{\"location\": \"Tokyo, Japan\"}', name='get_current_weather'), type='function'), ChatCompletionMessageToolCall(id='call_mLffU4B69qExzM8qaeyzkYQd', function=Function(arguments='{\"location\": \"Paris, France\"}', name='get_current_weather'), type='function')]))], created=1719518406, model='gpt-4o-2024-05-13', object='chat.completion', service_tier=None, system_fingerprint='fp_ce0793330f', usage=CompletionUsage(completion_tokens=68, prompt_tokens=85, total_tokens=153))\n", + "ChatCompletion(id='chatcmpl-9epZo59ilaWhx9uHAV7cWZG8rWrjb', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content=\"Currently, here's the 
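
The retriever notebook now imports `BM25Retriever` in place of `InMemoryBM25Retriever`; the call pattern is otherwise unchanged. A small sketch using the same toy corpus as the notebook:

```python
# Sketch of the renamed retriever API; corpus and query mirror the notebook cells.
from lightrag.components.retriever import BM25Retriever

retriever = BM25Retriever(top_k=1)
retriever.build_index_from_documents(
    ["hello world", "world is beautiful", "today is a good day"]
)

output = retriever.retrieve("hello")
print(output)
```
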
weather in the specified cities:\\n\\n- **San Francisco, CA:** 72°F\\n- **Tokyo, Japan:** 10°C\\n- **Paris, France:** 22°C\", role='assistant', function_call=None, tool_calls=None))], created=1719518408, model='gpt-4o-2024-05-13', object='chat.completion', service_tier=None, system_fingerprint='fp_d576307f90', usage=CompletionUsage(completion_tokens=40, prompt_tokens=151, total_tokens=191))\n" + ] + } + ], + "source": [ + "from openai import OpenAI\n", + "import json\n", + "from lightrag.utils import setup_env\n", + "\n", + "client = OpenAI()\n", + "\n", + "# Example dummy function hard coded to return the same weather\n", + "# In production, this could be your backend API or an external API\n", + "def get_current_weather(location, unit=\"fahrenheit\"):\n", + " \"\"\"Get the current weather in a given location\"\"\"\n", + " if \"tokyo\" in location.lower():\n", + " return json.dumps({\"location\": \"Tokyo\", \"temperature\": \"10\", \"unit\": unit})\n", + " elif \"san francisco\" in location.lower():\n", + " return json.dumps({\"location\": \"San Francisco\", \"temperature\": \"72\", \"unit\": unit})\n", + " elif \"paris\" in location.lower():\n", + " return json.dumps({\"location\": \"Paris\", \"temperature\": \"22\", \"unit\": unit})\n", + " else:\n", + " return json.dumps({\"location\": location, \"temperature\": \"unknown\"})\n", + "\n", + "def run_conversation():\n", + " # Step 1: send the conversation and available functions to the model\n", + " messages = [{\"role\": \"user\", \"content\": \"What's the weather like in San Francisco, Tokyo, and Paris?\"}]\n", + " tools = [\n", + " {\n", + " \"type\": \"function\",\n", + " \"function\": {\n", + " \"name\": \"get_current_weather\",\n", + " \"description\": \"Get the current weather in a given location\",\n", + " \"parameters\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"location\": {\n", + " \"type\": \"string\",\n", + " \"description\": \"The city and state, e.g. 
San Francisco, CA\",\n", + " },\n", + " \"unit\": {\"type\": \"string\", \"enum\": [\"celsius\", \"fahrenheit\"]},\n", + " },\n", + " \"required\": [\"location\"],\n", + " },\n", + " },\n", + " }\n", + " ]\n", + " response = client.chat.completions.create(\n", + " model=\"gpt-4o\",\n", + " messages=messages,\n", + " tools=tools,\n", + " tool_choice=\"auto\", # auto is default, but we'll be explicit\n", + " )\n", + " print(response)\n", + " response_message = response.choices[0].message\n", + " tool_calls = response_message.tool_calls\n", + " # Step 2: check if the model wanted to call a function\n", + " if tool_calls:\n", + " # Step 3: call the function\n", + " # Note: the JSON response may not always be valid; be sure to handle errors\n", + " available_functions = {\n", + " \"get_current_weather\": get_current_weather,\n", + " } # only one function in this example, but you can have multiple\n", + " messages.append(response_message) # extend conversation with assistant's reply\n", + " # Step 4: send the info for each function call and function response to the model\n", + " for tool_call in tool_calls:\n", + " function_name = tool_call.function.name\n", + " function_to_call = available_functions[function_name]\n", + " function_args = json.loads(tool_call.function.arguments)\n", + " function_response = function_to_call(\n", + " location=function_args.get(\"location\"),\n", + " unit=function_args.get(\"unit\"),\n", + " )\n", + " messages.append(\n", + " {\n", + " \"tool_call_id\": tool_call.id,\n", + " \"role\": \"tool\",\n", + " \"name\": function_name,\n", + " \"content\": function_response,\n", + " }\n", + " ) # extend conversation with function response\n", + " second_response = client.chat.completions.create(\n", + " model=\"gpt-4o\",\n", + " messages=messages,\n", + " ) # get a new response from the model where it can see the function response\n", + " return second_response\n", + "print(run_conversation())" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from dataclasses import dataclass\n", + "from typing import Any, Dict, List, Tuple\n", + "import numpy as np\n", + "import time\n", + "import asyncio\n", + "\n", + "def multiply(a: int, b: int) -> int:\n", + " \"\"\"Multiply two numbers.\"\"\"\n", + " time.sleep(1)\n", + " return a * b\n", + "\n", + "def add(a: int, b: int) -> int:\n", + " \"\"\"Add two numbers.\"\"\"\n", + " time.sleep(1)\n", + " return a + b\n", + "\n", + "async def divide(a: float, b: float) -> float:\n", + " \"\"\"Divide two numbers.\"\"\"\n", + " await asyncio.sleep(1)\n", + " return float(a) / b\n", + "\n", + "\n", + "async def search(query: str) -> List[str]:\n", + " \"\"\"Search for query and return a list of results.\"\"\"\n", + " await asyncio.sleep(1)\n", + " return [\"result1\" + query, \"result2\" + query]\n", + "\n", + "\n", + "def numpy_sum(arr: np.ndarray) -> float:\n", + " \"\"\"Sum the elements of an array.\"\"\"\n", + " return np.sum(arr)\n", + "\n", + "x = 2\n", + "@dataclass\n", + "class Point:\n", + " x: int\n", + " y: int\n", + "\n", + "def add_points(p1: Point, p2: Point) -> Point:\n", + " return Point(p1.x + p2.x, p1.y + p2.y)\n", + "\n", + "all_functions = [multiply, add, divide, search, numpy_sum, add_points]\n", + "\n", + "all_functions_dict = {f.__name__: f for f in all_functions}" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "FunctionTool(fn: , async: False, 
definition: FunctionDefinition(func_name='multiply', func_desc='multiply(a: int, b: int) -> int\\nMultiply two numbers.', func_parameters={'type': 'object', 'properties': {'a': {'type': 'int'}, 'b': {'type': 'int'}}, 'required': ['a', 'b']}))\n", + "FunctionTool(fn: , async: False, definition: FunctionDefinition(func_name='add', func_desc='add(a: int, b: int) -> int\\nAdd two numbers.', func_parameters={'type': 'object', 'properties': {'a': {'type': 'int'}, 'b': {'type': 'int'}}, 'required': ['a', 'b']}))\n", + "FunctionTool(fn: , async: True, definition: FunctionDefinition(func_name='divide', func_desc='divide(a: float, b: float) -> float\\nDivide two numbers.', func_parameters={'type': 'object', 'properties': {'a': {'type': 'float'}, 'b': {'type': 'float'}}, 'required': ['a', 'b']}))\n", + "FunctionTool(fn: , async: True, definition: FunctionDefinition(func_name='search', func_desc='search(query: str) -> List[str]\\nSearch for query and return a list of results.', func_parameters={'type': 'object', 'properties': {'query': {'type': 'str'}}, 'required': ['query']}))\n", + "FunctionTool(fn: , async: False, definition: FunctionDefinition(func_name='numpy_sum', func_desc='numpy_sum(arr: numpy.ndarray) -> float\\nSum the elements of an array.', func_parameters={'type': 'object', 'properties': {'arr': {'type': 'ndarray'}}, 'required': ['arr']}))\n", + "FunctionTool(fn: , async: False, definition: FunctionDefinition(func_name='add_points', func_desc='add_points(p1: __main__.Point, p2: __main__.Point) -> __main__.Point\\nNone', func_parameters={'type': 'object', 'properties': {'p1': {'type': 'Point', 'properties': {'x': {'type': 'int'}, 'y': {'type': 'int'}}, 'required': ['x', 'y']}, 'p2': {'type': 'Point', 'properties': {'x': {'type': 'int'}, 'y': {'type': 'int'}}, 'required': ['x', 'y']}}, 'required': ['p1', 'p2']}))\n" + ] + } + ], + "source": [ + "# describing the functions\n", + "\n", + "from lightrag.core.func_tool import FunctionTool\n", + "\n", + "functions =[multiply, add, divide, search, numpy_sum, add_points]\n", + "tools = [\n", + " FunctionTool(fn=fn) for fn in functions\n", + "]\n", + "for tool in tools:\n", + " print(tool)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# create a context map\n", + "context_map = {tool.definition.func_name: tool for tool in tools}\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'func_name': 'numpy_sum', 'func_desc': 'numpy_sum(arr: numpy.ndarray) -> float\\nSum the elements of an array.', 'func_parameters': {'type': 'object', 'properties': {'arr': {'type': 'ndarray'}}, 'required': ['arr']}}\n", + "{\n", + " \"func_name\": \"numpy_sum\",\n", + " \"func_desc\": \"numpy_sum(arr: numpy.ndarray) -> float\\nSum the elements of an array.\",\n", + " \"func_parameters\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"arr\": {\n", + " \"type\": \"ndarray\"\n", + " }\n", + " },\n", + " \"required\": [\n", + " \"arr\"\n", + " ]\n", + " }\n", + "}\n", + "\"func_name: numpy_sum\\nfunc_desc: 'numpy_sum(arr: numpy.ndarray) -> float\\n\\n Sum the elements of an array.'\\nfunc_parameters:\\n type: object\\n properties:\\n arr:\\n type: ndarray\\n required:\\n - arr\\n\"\n" + ] + } + ], + "source": [ + "print(tools[-2].definition.to_dict())\n", + "\n", + "print(tools[-2].definition.to_json())\n", + "\n", + "print(repr(tools[-2].definition.to_yaml()))" + ] + }, + { + 
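
Each `FunctionTool` derives its `FunctionDefinition` from the wrapped function's signature and docstring, and the definition can be serialized three ways, as the cell above shows. A compact sketch of the same round trip:

```python
# Sketch: wrap a plain function and inspect its auto-generated definition
# in dict, JSON, and YAML form.
from lightrag.core.func_tool import FunctionTool


def add(a: int, b: int) -> int:
    """Add two numbers."""
    return a + b


tool = FunctionTool(fn=add)

print(tool.definition.to_dict())
print(tool.definition.to_json())
print(tool.definition.to_yaml())
```
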
"cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'func_name': 'get_current_weather',\n", + " 'func_desc': \"get_current_weather(location, unit='fahrenheit')\\nGet the current weather in a given location\",\n", + " 'func_parameters': {'type': 'object',\n", + " 'properties': {'location': {'type': 'Any'},\n", + " 'unit': {'type': 'Any', 'default': 'fahrenheit'}},\n", + " 'required': ['location']}}" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# tool definition for get_current_weather\n", + "\n", + "ft = FunctionTool(fn=get_current_weather)\n", + "ft.definition.to_dict()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ToolManager(Tools: [FunctionTool(fn: , async: False, definition: FunctionDefinition(func_name='multiply', func_desc='multiply(a: int, b: int) -> int\\nMultiply two numbers.', func_parameters={'type': 'object', 'properties': {'a': {'type': 'int'}, 'b': {'type': 'int'}}, 'required': ['a', 'b']})), FunctionTool(fn: , async: False, definition: FunctionDefinition(func_name='add', func_desc='add(a: int, b: int) -> int\\nAdd two numbers.', func_parameters={'type': 'object', 'properties': {'a': {'type': 'int'}, 'b': {'type': 'int'}}, 'required': ['a', 'b']})), FunctionTool(fn: , async: True, definition: FunctionDefinition(func_name='divide', func_desc='divide(a: float, b: float) -> float\\nDivide two numbers.', func_parameters={'type': 'object', 'properties': {'a': {'type': 'float'}, 'b': {'type': 'float'}}, 'required': ['a', 'b']})), FunctionTool(fn: , async: True, definition: FunctionDefinition(func_name='search', func_desc='search(query: str) -> List[str]\\nSearch for query and return a list of results.', func_parameters={'type': 'object', 'properties': {'query': {'type': 'str'}}, 'required': ['query']})), FunctionTool(fn: , async: False, definition: FunctionDefinition(func_name='numpy_sum', func_desc='numpy_sum(arr: numpy.ndarray) -> float\\nSum the elements of an array.', func_parameters={'type': 'object', 'properties': {'arr': {'type': 'ndarray'}}, 'required': ['arr']})), FunctionTool(fn: , async: False, definition: FunctionDefinition(func_name='add_points', func_desc='add_points(p1: __main__.Point, p2: __main__.Point) -> __main__.Point\\nNone', func_parameters={'type': 'object', 'properties': {'p1': {'type': 'Point', 'properties': {'x': {'type': 'int'}, 'y': {'type': 'int'}}, 'required': ['x', 'y']}, 'p2': {'type': 'Point', 'properties': {'x': {'type': 'int'}, 'y': {'type': 'int'}}, 'required': ['x', 'y']}}, 'required': ['p1', 'p2']}))], Additional Context: {})\n" + ] + } + ], + "source": [ + "# to further help us manage the whole process, we will use a tool manager\n", + "\n", + "from lightrag.core.tool_manager import ToolManager\n", + "\n", + "tool_manager = ToolManager(tools=functions)\n", + "print(tool_manager)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "FunctionOutput(name='get_current_weather', input=Function(thought=None, name='get_current_weather', args=(), kwargs={'location': 'San Francisco', 'unit': 'celsius'}), parsed_input=None, output='{\"location\": \"San Francisco\", \"temperature\": \"72\", \"unit\": \"celsius\"}', error=None)" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 
execute get_current_weather using function call \n", + "\n", + "ft.call(**{\"location\": \"San Francisco\", \"unit\": \"celsius\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ">\n" + ] + }, + { + "data": { + "text/plain": [ + "FunctionOutput(name='divide', input=Function(thought=None, name='divide', args=(), kwargs={'a': 10, 'b': 2}), parsed_input=None, output=5.0, error=None)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "FunctionOutput(name='divide', input=Function(thought=None, name='divide', args=(), kwargs={'a': 10, 'b': 2}), parsed_input=None, output=5.0, error=None)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# async call\n", + "import nest_asyncio\n", + "from IPython.display import display\n", + "\n", + "\n", + "nest_asyncio.apply()\n", + "\n", + "# call it synchronously using execute\n", + "\n", + "print(tools[2].execute(**{\"a\": 10, \"b\": 2}))\n", + "\n", + "display(await tools[2].acall(**{\"a\": 10, \"b\": 2}))\n", + "display(await tools[2].execute(**{\"a\": 10, \"b\": 2}))\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/h8/nhgbdr4d18x2r49j4pk5z6gw0000gn/T/ipykernel_1588/1960468447.py:5: RuntimeWarning: coroutine 'to_thread' was never awaited\n", + " print(tools[1].execute(**{\"a\": 10, \"b\": 2}))\n", + "RuntimeWarning: Enable tracemalloc to get the object allocation traceback\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "FunctionOutput(name='add', input=Function(thought=None, name='add', args=(), kwargs={'a': 10, 'b': 2}), parsed_input=None, output=12, error=None)\n" + ] + }, + { + "data": { + "text/plain": [ + "FunctionOutput(name='add', input=Function(thought=None, name='add', args=(), kwargs={'a': 10, 'b': 2}), parsed_input=None, output=12, error=None)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# run sync func\n", + "\n", + "# in sync way\n", + "\n", + "print(tools[1].execute(**{\"a\": 10, \"b\": 2}))\n", + "print(tools[1].call(**{\"a\": 10, \"b\": 2}))\n", + "\n", + "# in async way\n", + "\n", + "display(await tools[1].execute(**{\"a\": 10, \"b\": 2}))" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "run_sync_and_async_mix_without_wait time: 4.006277084350586\n" + ] + }, + { + "data": { + "text/plain": [ + "[>,\n", + " FunctionOutput(name='sync_function_2', input=Function(thought=None, name='sync_function_2', args=(), kwargs={}), parsed_input=None, output='Function 2 completed', error=None),\n", + " FunctionOutput(name='sync_function_2', input=Function(thought=None, name='sync_function_2', args=(), kwargs={}), parsed_input=None, output='Function 2 completed', error=None)]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "run_sync_and_async_mix time: 2.0017499923706055\n" + ] + }, + { + "data": { + "text/plain": [ + "[FunctionOutput(name='async_function_1', input=Function(thought=None, name='async_function_1', args=(), kwargs={}), parsed_input=None, output='Function 1 completed', 
error=None),\n", + " FunctionOutput(name='sync_function_2', input=Function(thought=None, name='sync_function_2', args=(), kwargs={}), parsed_input=None, output='Function 2 completed', error=None),\n", + " FunctionOutput(name='async_function_2', input=Function(thought=None, name='async_function_2', args=(), kwargs={}), parsed_input=None, output='Function 2 completed', error=None)]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# call all the above functions \n", + "import nest_asyncio\n", + "import asyncio\n", + "\n", + "nest_asyncio.apply()\n", + "\n", + "\n", + "import time\n", + "\n", + "async def async_function_1():\n", + " await asyncio.sleep(1)\n", + " return \"Function 1 completed\"\n", + "\n", + "def sync_function_1():\n", + " time.sleep(1)\n", + " return \"Function 1 completed\"\n", + "\n", + "async def async_function_2():\n", + " await asyncio.sleep(2)\n", + " return \"Function 2 completed\"\n", + "\n", + "def sync_function_2():\n", + " time.sleep(2)\n", + " return \"Function 2 completed\"\n", + "\n", + "async_tool_1 = FunctionTool(async_function_1)\n", + "sync_tool_1 = FunctionTool(sync_function_2)\n", + "async_tool_2 = FunctionTool(async_function_2)\n", + "sync_tool_2 = FunctionTool(sync_function_2)\n", + "\n", + "def run_sync_and_async_mix_without_wait():\n", + " # both sync and async tool can use execute\n", + " # sync tool can also use call\n", + " # takes 5 seconds (1+1+2) + overhead\n", + " start_time = time.time()\n", + " results = [\n", + " async_tool_1.execute(),\n", + " sync_tool_1.call(),\n", + " sync_tool_2.call(),\n", + " ]\n", + " end_time = time.time()\n", + " print(f\"run_sync_and_async_mix_without_wait time: {end_time - start_time}\")\n", + " return results\n", + "\n", + "async def run_sync_and_async_mix():\n", + " # both sync and async tool can use execute&to_thread\n", + " # async tool can also use acall without to_thread\n", + " # takes a bit over 2 seconds max(2)\n", + " start_time = time.time()\n", + " results = await asyncio.gather(\n", + " async_tool_1.execute(),\n", + " sync_tool_1.execute(),\n", + " \n", + " async_tool_2.acall(),\n", + " )\n", + " end_time = time.time()\n", + " print(f\"run_sync_and_async_mix time: {end_time - start_time}\")\n", + " return results\n", + "\n", + "# Execute functions\n", + "results_without_wait = run_sync_and_async_mix_without_wait()\n", + "display(results_without_wait)\n", + "\n", + "results_with_wait = asyncio.run(run_sync_and_async_mix())\n", + "display(results_with_wait)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# prepare a template for generator\n", + "template = r\"\"\"You have these tools available:\n", + "{% if tools %}\n", + "\n", + "{% for tool in tools %}\n", + "{{ loop.index }}.\n", + "{{tool}}\n", + "------------------------\n", + "{% endfor %}\n", + "\n", + "{% endif %}\n", + "\n", + "{{output_format_str}}\n", + "\n", + "\n", + "User: {{input_str}}\n", + "You:\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "You have these tools available:\n", + "\n", + "1.\n", + "func_name: multiply\n", + "func_desc: 'multiply(a: int, b: int) -> int\n", + "\n", + " Multiply two numbers.'\n", + "func_parameters:\n", + " type: object\n", + " properties:\n", + " a:\n", + " type: int\n", + " b:\n", + " type: int\n", + " required:\n", + " - a\n", + " - b\n", + "\n", + "------------------------\n", 
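
The timing difference above comes from `asyncio.gather` overlapping the waits: the synchronous tool can be pushed onto a worker thread while the coroutines await, so the total is roughly the longest single call instead of the sum. A standard-library sketch of the same effect, independent of the `FunctionTool` wrappers:

```python
# Standard-library illustration of the ~max(1, 2)s total seen above:
# asyncio.to_thread lets the blocking sleep overlap with the coroutine.
import asyncio
import time


def sync_function_2() -> str:
    time.sleep(2)
    return "Function 2 completed"


async def async_function_1() -> str:
    await asyncio.sleep(1)
    return "Function 1 completed"


async def main() -> list:
    start = time.time()
    results = await asyncio.gather(
        async_function_1(),
        asyncio.to_thread(sync_function_2),
    )
    print(f"elapsed: {time.time() - start:.2f}s")  # about 2s, not 3s
    return results


print(asyncio.run(main()))
```
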
+ "2.\n", + "func_name: add\n", + "func_desc: 'add(a: int, b: int) -> int\n", + "\n", + " Add two numbers.'\n", + "func_parameters:\n", + " type: object\n", + " properties:\n", + " a:\n", + " type: int\n", + " b:\n", + " type: int\n", + " required:\n", + " - a\n", + " - b\n", + "\n", + "------------------------\n", + "3.\n", + "func_name: divide\n", + "func_desc: 'divide(a: float, b: float) -> float\n", + "\n", + " Divide two numbers.'\n", + "func_parameters:\n", + " type: object\n", + " properties:\n", + " a:\n", + " type: float\n", + " b:\n", + " type: float\n", + " required:\n", + " - a\n", + " - b\n", + "\n", + "------------------------\n", + "4.\n", + "func_name: search\n", + "func_desc: 'search(query: str) -> List[str]\n", + "\n", + " Search for query and return a list of results.'\n", + "func_parameters:\n", + " type: object\n", + " properties:\n", + " query:\n", + " type: str\n", + " required:\n", + " - query\n", + "\n", + "------------------------\n", + "5.\n", + "func_name: numpy_sum\n", + "func_desc: 'numpy_sum(arr: numpy.ndarray) -> float\n", + "\n", + " Sum the elements of an array.'\n", + "func_parameters:\n", + " type: object\n", + " properties:\n", + " arr:\n", + " type: ndarray\n", + " required:\n", + " - arr\n", + "\n", + "------------------------\n", + "6.\n", + "func_name: add_points\n", + "func_desc: 'add_points(p1: __main__.Point, p2: __main__.Point) -> __main__.Point\n", + "\n", + " None'\n", + "func_parameters:\n", + " type: object\n", + " properties:\n", + " p1:\n", + " type: Point\n", + " properties:\n", + " x:\n", + " type: int\n", + " y:\n", + " type: int\n", + " required:\n", + " - x\n", + " - y\n", + " p2:\n", + " type: Point\n", + " properties:\n", + " x:\n", + " type: int\n", + " y:\n", + " type: int\n", + " required:\n", + " - x\n", + " - y\n", + " required:\n", + " - p1\n", + " - p2\n", + "\n", + "------------------------\n", + "\n", + "\n", + "None\n", + "\n", + "\n", + "User: None\n", + "You:\n", + "\n" + ] + } + ], + "source": [ + "# let's see how the template can be rendered with tools\n", + "from lightrag.core.prompt_builder import Prompt\n", + "\n", + "prompt = Prompt(template=template)\n", + "small_tool_manager = ToolManager(tools=tools[:2])\n", + "\n", + "renered_prompt = prompt(tools=tool_manager.yaml_definitions)\n", + "print(renered_prompt)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "You have these tools available:\n", + "\n", + "{\n", + " \"name\": \"The name of the function (str) (optional)\",\n", + " \"args\": \"The positional arguments of the function (Optional) (optional)\",\n", + " \"kwargs\": \"The keyword arguments of the function (Optional) (optional)\"\n", + "}\n", + "\n", + "\n", + "User: None\n", + "You:\n", + "\n" + ] + } + ], + "source": [ + "# let's render the output format using Function class \n", + "\n", + "from lightrag.core.types import Function\n", + "\n", + "\n", + "output_data_class = Function \n", + "output_format_str = output_data_class.to_json_signature(exclude=[\"thought\"])\n", + "\n", + "renered_prompt= prompt(output_format_str=output_format_str)\n", + "print(renered_prompt)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "You have these tools available:\n", + "\n", + "{\n", + " \"action\": \"Formatted as FuncName(, ), where FuncName is the function name, are positional arguments, and are 
keyword arguments in key=value form. Example: 'FuncName(1, b=2)' calls 'FuncName' with positional argument 1 and keyword argument b=2. (str) (required)\"\n", + "}\n", + "\n", + "\n", + "User: None\n", + "You:\n", + "\n" + ] + } + ], + "source": [ + "# use FunctionExpression\n", + "from lightrag.core.types import FunctionExpression\n", + "\n", + "output_data_class = FunctionExpression\n", + "output_format_str = output_data_class.to_json_signature(exclude=[\"thought\"])\n", + "print(prompt(output_format_str=output_format_str))" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Your output should be formatted as a standard JSON instance with the following schema:\n", + "```\n", + "{\n", + " \"name\": \"The name of the function (str) (optional)\",\n", + " \"args\": \"The positional arguments of the function (Optional) (optional)\",\n", + " \"kwargs\": \"The keyword arguments of the function (Optional) (optional)\"\n", + "}\n", + "```\n", + "-Make sure to always enclose the JSON output in triple backticks (```). Please do not add anything other than valid JSON output!\n", + "-Use double quotes for the keys and string values.\n", + "-Follow the JSON formatting conventions.\n" + ] + } + ], + "source": [ + "# let's adds more instruction and this time, we will use JsonOutputParser\n", + "\n", + "from lightrag.components.output_parsers import JsonOutputParser\n", + "\n", + "func_parser = JsonOutputParser(data_class=Function)\n", + "instructions = func_parser.format_instructions(exclude=[\"thought\"])\n", + "print(instructions)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Generator(\n", + " model_kwargs={'model': 'gpt-3.5-turbo'}, \n", + " (prompt): Prompt(\n", + " template: You have these tools available:\n", + " {% if tools %}\n", + " \n", + " {% for tool in tools %}\n", + " {{ loop.index }}.\n", + " {{tool}}\n", + " ------------------------\n", + " {% endfor %}\n", + " \n", + " {% endif %}\n", + " \n", + " {{output_format_str}}\n", + " \n", + " \n", + " User: {{input_str}}\n", + " You:\n", + " , prompt_kwargs: {'tools': [\"func_name: multiply\\nfunc_desc: 'multiply(a: int, b: int) -> int\\n\\n Multiply two numbers.'\\nfunc_parameters:\\n type: object\\n properties:\\n a:\\n type: int\\n b:\\n type: int\\n required:\\n - a\\n - b\\n\", \"func_name: add\\nfunc_desc: 'add(a: int, b: int) -> int\\n\\n Add two numbers.'\\nfunc_parameters:\\n type: object\\n properties:\\n a:\\n type: int\\n b:\\n type: int\\n required:\\n - a\\n - b\\n\", \"func_name: divide\\nfunc_desc: 'divide(a: float, b: float) -> float\\n\\n Divide two numbers.'\\nfunc_parameters:\\n type: object\\n properties:\\n a:\\n type: float\\n b:\\n type: float\\n required:\\n - a\\n - b\\n\", \"func_name: search\\nfunc_desc: 'search(query: str) -> List[str]\\n\\n Search for query and return a list of results.'\\nfunc_parameters:\\n type: object\\n properties:\\n query:\\n type: str\\n required:\\n - query\\n\", \"func_name: numpy_sum\\nfunc_desc: 'numpy_sum(arr: numpy.ndarray) -> float\\n\\n Sum the elements of an array.'\\nfunc_parameters:\\n type: object\\n properties:\\n arr:\\n type: ndarray\\n required:\\n - arr\\n\", \"func_name: add_points\\nfunc_desc: 'add_points(p1: __main__.Point, p2: __main__.Point) -> __main__.Point\\n\\n None'\\nfunc_parameters:\\n type: object\\n properties:\\n p1:\\n type: Point\\n properties:\\n x:\\n type: 
int\\n y:\\n type: int\\n required:\\n - x\\n - y\\n p2:\\n type: Point\\n properties:\\n x:\\n type: int\\n y:\\n type: int\\n required:\\n - x\\n - y\\n required:\\n - p1\\n - p2\\n\"], 'output_format_str': 'Your output should be formatted as a standard JSON instance with the following schema:\\n```\\n{\\n \"name\": \"The name of the function (str) (optional)\",\\n \"kwargs\": \"The keyword arguments of the function (Optional) (optional)\"\\n}\\n```\\n-Make sure to always enclose the JSON output in triple backticks (```). Please do not add anything other than valid JSON output!\\n-Use double quotes for the keys and string values.\\n-Follow the JSON formatting conventions.'}, prompt_variables: ['output_format_str', 'tools', 'input_str']\n", + " )\n", + " (model_client): OpenAIClient()\n", + " (output_processors): JsonOutputParser(\n", + " data_class_for_json=\n", + " (json_output_format_prompt): Prompt(\n", + " template: Your output should be formatted as a standard JSON instance with the following schema:\n", + " ```\n", + " {{schema}}\n", + " ```\n", + " {% if example %}\n", + " Here is an example:\n", + " ```\n", + " {{example}}\n", + " ```\n", + " {% endif %}\n", + " -Make sure to always enclose the JSON output in triple backticks (```). Please do not add anything other than valid JSON output!\n", + " -Use double quotes for the keys and string values.\n", + " -Follow the JSON formatting conventions., prompt_variables: ['example', 'schema']\n", + " )\n", + " (output_processors): JsonParser()\n", + " )\n", + ")" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# create the generator\n", + "from lightrag.core.generator import Generator\n", + "from lightrag.core.types import ModelClientType\n", + "\n", + "model_kwargs = {\"model\": \"gpt-3.5-turbo\"}\n", + "prompt_kwargs = {\n", + " \"tools\": tool_manager.yaml_definitions,\n", + " \"output_format_str\": func_parser.format_instructions(\n", + " exclude=[\"thought\", \"args\"]\n", + " ),\n", + "}\n", + "generator = Generator(\n", + " model_client=ModelClientType.OPENAI(),\n", + " model_kwargs=model_kwargs,\n", + " template=template,\n", + " prompt_kwargs=prompt_kwargs,\n", + " output_processors=func_parser,\n", + ")\n", + "generator" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "10" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "arr = np.array([[1, 2], [3, 4]])\n", + "numpy_sum(arr)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "# call queries\n", + "queries = [\n", + " \"add 2 and 3\",\n", + " \"search for something\",\n", + " \"add points (1, 2) and (3, 4)\",\n", + " \"sum numpy array with arr = np.array([[1, 2], [3, 4]])\",\n", + " \"multiply 2 with local variable x\",\n", + " \"divide 2 by 3\",\n", + " \"Add 5 to variable y\",\n", + " ]" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "0 Query: add 2 and 3\n", + "--------------------------------------------------\n", + "Function: Function(thought=None, name='add', args=[], kwargs={'a': 2, 'b': 3})\n" + ] + }, + { + "data": { + "text/plain": [ + "'Function output: '" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"\n", + "1 Query: search for something\n", + "--------------------------------------------------\n", + "Function: Function(thought=None, name='search', args=[], kwargs={'query': 'something'})\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/h8/nhgbdr4d18x2r49j4pk5z6gw0000gn/T/ipykernel_1588/3588443470.py:10: RuntimeWarning: coroutine 'to_thread' was never awaited\n", + " func_output= tool_manager.execute_func(func)\n", + "RuntimeWarning: Enable tracemalloc to get the object allocation traceback\n" + ] + }, + { + "data": { + "text/plain": [ + "\"Function output: >\"" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "2 Query: add points (1, 2) and (3, 4)\n", + "--------------------------------------------------\n", + "Function: Function(thought=None, name='add_points', args=[], kwargs={'p1': {'x': 1, 'y': 2}, 'p2': {'x': 3, 'y': 4}})\n" + ] + }, + { + "data": { + "text/plain": [ + "'Function output: '" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "3 Query: sum numpy array with arr = np.array([[1, 2], [3, 4]])\n", + "--------------------------------------------------\n", + "Function: Function(thought=None, name='numpy_sum', args=[], kwargs={'arr': [[1, 2], [3, 4]]})\n" + ] + }, + { + "data": { + "text/plain": [ + "'Function output: '" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "4 Query: multiply 2 with local variable x\n", + "--------------------------------------------------\n", + "Function: Function(thought=None, name='multiply', args=[], kwargs={'a': 2, 'b': 'x'})\n" + ] + }, + { + "data": { + "text/plain": [ + "'Function output: '" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "5 Query: divide 2 by 3\n", + "--------------------------------------------------\n", + "Function: Function(thought=None, name='divide', args=[], kwargs={'a': 2.0, 'b': 3.0})\n" + ] + }, + { + "data": { + "text/plain": [ + "\"Function output: >\"" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "6 Query: Add 5 to variable y\n", + "--------------------------------------------------\n", + "Function: Function(thought=None, name='add', args=[], kwargs={'a': 5, 'b': 10})\n" + ] + }, + { + "data": { + "text/plain": [ + "'Function output: '" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "\n", + "for idx, query in enumerate(queries):\n", + " prompt_kwargs = {\"input_str\": query}\n", + " print(f\"\\n{idx} Query: {query}\")\n", + " print(f\"{'-'*50}\")\n", + " try:\n", + " result = generator(prompt_kwargs=prompt_kwargs)\n", + " # print(f\"LLM raw output: {result.raw_response}\")\n", + " func = Function.from_dict(result.data)\n", + " print(f\"Function: {func}\")\n", + " func_output= tool_manager.execute_func(func)\n", + " display(f\"Function output: {func_output}\")\n", + " except Exception as e:\n", + " print(f\"Failed to execute the function for query: {query}, func: {result.data}, error: {e}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Problems with Function directly:\n", + "1. difficult to support data types. 
Unless to update the function to use dict version of the data types to do it.\n", + "\n", + "```python\n", + "def add_points(p1: dict, p2: dict) -> dict:\n", + " p1 = Point(**p1)\n", + " p2 = Point(**p2)\n", + " return add_points_tool.fn(p1, p2).__dict__\n", + "```\n", + "2. difficult to use variable as arguments. [TODO: find a proper way to demonstrate it]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "action: Formatted as FuncName(, ), where FuncName is the function name, are positional arguments, and are keyword arguments in key=value form. Example: 'FuncName(1, b=2)' calls 'FuncName' with positional argument 1 and keyword argument b=2. (str) (required)\n", + "FunctionExpression(thought=None, action='add_points(p1=Point(x=1, y=2), p2=Point(x=3, y=4))')\n" + ] + } + ], + "source": [ + "# let's use FunctionExpression to call the function instead \n", + "\n", + "from lightrag.core.types import FunctionExpression\n", + "\n", + "output_data_class = FunctionExpression\n", + "output_format_str = output_data_class.to_yaml_signature(exclude=[\"thought\"])\n", + "print(output_format_str)\n", + "\n", + "# lets' add one example to be more robust that they should call it with function call expression\n", + "example = FunctionExpression.from_function(thought=None, func=add_points, **{\"p1\": Point(1, 2), \"p2\": Point(3, 4)})\n", + "print(example)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prompt:\n", + "\n", + "You have these tools available:\n", + "\n", + "1.\n", + "func_name: multiply\n", + "func_desc: Multiply two numbers.\n", + "func_parameters:\n", + " type: object\n", + " properties:\n", + " a:\n", + " type: int\n", + " b:\n", + " type: int\n", + " required:\n", + " - a\n", + " - b\n", + "\n", + "------------------------\n", + "2.\n", + "func_name: add\n", + "func_desc: Add two numbers.\n", + "func_parameters:\n", + " type: object\n", + " properties:\n", + " a:\n", + " type: int\n", + " b:\n", + " type: int\n", + " required:\n", + " - a\n", + " - b\n", + "\n", + "------------------------\n", + "3.\n", + "func_name: divide\n", + "func_desc: Divide two numbers.\n", + "func_parameters:\n", + " type: object\n", + " properties:\n", + " a:\n", + " type: float\n", + " b:\n", + " type: float\n", + " required:\n", + " - a\n", + " - b\n", + "\n", + "------------------------\n", + "4.\n", + "func_name: search\n", + "func_desc: Search for query and return a list of results.\n", + "func_parameters:\n", + " type: object\n", + " properties:\n", + " query:\n", + " type: str\n", + " required:\n", + " - query\n", + "\n", + "------------------------\n", + "5.\n", + "func_name: numpy_sum\n", + "func_desc: Sum the elements of an array.\n", + "func_parameters:\n", + " type: object\n", + " properties:\n", + " arr:\n", + " type: ndarray\n", + " required:\n", + " - arr\n", + "\n", + "------------------------\n", + "6.\n", + "func_name: add_points\n", + "func_desc: None\n", + "func_parameters:\n", + " type: object\n", + " properties:\n", + " p1:\n", + " type: Point\n", + " properties:\n", + " x:\n", + " type: int\n", + " y:\n", + " type: int\n", + " required:\n", + " - x\n", + " - y\n", + " p2:\n", + " type: Point\n", + " properties:\n", + " x:\n", + " type: int\n", + " y:\n", + " 
type: int\n", + " required:\n", + " - x\n", + " - y\n", + " required:\n", + " - p1\n", + " - p2\n", + "\n", + "------------------------\n", + "\n", + "\n", + "Your output should be formatted as a standard JSON instance with the following schema:\n", + "```\n", + "{\n", + " \"action\": \"Formatted as FuncName(, ), where FuncName is the function name, are positional arguments, and are keyword arguments in key=value form. Example: 'FuncName(1, b=2)' calls 'FuncName' with positional argument 1 and keyword argument b=2. (str) (required)\"\n", + "}\n", + "```\n", + "Here is an example:\n", + "```\n", + "{\n", + " \"action\": \"add_points(p1=Point(x=1, y=2), p2=Point(x=3, y=4))\"\n", + "}\n", + "```\n", + "-Make sure to always enclose the JSON output in triple backticks (```). Please do not add anything other than valid JSON output!\n", + "-Use double quotes for the keys and string values.\n", + "-Follow the JSON formatting conventions.\n", + "\n", + "\n", + "User: None\n", + "You:\n", + "\n" + ] + } + ], + "source": [ + "# also use json output parser and create a new generator\n", + "\n", + "parser = JsonOutputParser(data_class=FunctionExpression, example=example)\n", + "instructions = parser.format_instructions(exclude=[\"thought\"])\n", + "\n", + "prompt_kwargs = {\n", + " \"tools\": [tool.definition.to_yaml() for tool in tools],\n", + " \"output_format_str\": parser.format_instructions(exclude=[\"thought\"]),\n", + " }\n", + "generator = Generator(\n", + " model_client=ModelClientType.OPENAI(),\n", + " model_kwargs=model_kwargs,\n", + " template=template,\n", + " prompt_kwargs=prompt_kwargs,\n", + " output_processors=parser\n", + ")\n", + "\n", + "generator.print_prompt(**prompt_kwargs)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Errpr at sandbox_exec: invalid syntax (, line 2)\n", + "Sandbox error: invalid syntax (, line 2)\n" + ] + } + ], + "source": [ + "import ast\n", + "import builtins\n", + "import contextlib\n", + "import ctypes\n", + "import sys\n", + "import threading\n", + "import time\n", + "\n", + "# Define a list of safe built-ins\n", + "SAFE_BUILTINS = {\n", + " 'abs': abs,\n", + " 'all': all,\n", + " 'any': any,\n", + " 'bin': bin,\n", + " 'bool': bool,\n", + " 'bytearray': bytearray,\n", + " 'bytes': bytes,\n", + " 'callable': callable,\n", + " 'chr': chr,\n", + " 'complex': complex,\n", + " 'dict': dict,\n", + " 'divmod': divmod,\n", + " 'enumerate': enumerate,\n", + " 'filter': filter,\n", + " 'float': float,\n", + " 'format': format,\n", + " 'frozenset': frozenset,\n", + " 'getattr': getattr,\n", + " 'hasattr': hasattr,\n", + " 'hash': hash,\n", + " 'hex': hex,\n", + " 'int': int,\n", + " 'isinstance': isinstance,\n", + " 'issubclass': issubclass,\n", + " 'iter': iter,\n", + " 'len': len,\n", + " 'list': list,\n", + " 'map': map,\n", + " 'max': max,\n", + " 'min': min,\n", + " 'next': next,\n", + " 'object': object,\n", + " 'oct': oct,\n", + " 'ord': ord,\n", + " 'pow': pow,\n", + " 'range': range,\n", + " 'repr': repr,\n", + " 'reversed': reversed,\n", + " 'round': round,\n", + " 'set': set,\n", + " 'slice': slice,\n", + " 'sorted': sorted,\n", + " 'str': str,\n", + " 'sum': sum,\n", + " 'tuple': tuple,\n", + " 'type': type,\n", + " 'zip': zip,\n", + "}\n", + "\n", + "# Define a context manager to limit execution time\n", + "# Create a sandbox execution function\n", + "def sandbox_exec(code, context=SAFE_BUILTINS, timeout=5):\n", + "\n", + " try:\n", + " 
compiled_code = compile(code, '', 'exec')\n", + "\n", + " # Result dictionary to store execution results\n", + " result = {\n", + " \"output\" : None,\n", + " \"error\" : None\n", + " }\n", + "\n", + " # Define a target function for the thread\n", + " def target():\n", + " try:\n", + " # Execute the code\n", + " exec(compiled_code, context, result)\n", + " except Exception as e:\n", + " result[\"error\"] = e\n", + " \n", + "\n", + " # Create a thread to execute the code\n", + " thread = threading.Thread(target=target)\n", + " thread.start()\n", + " thread.join(timeout)\n", + "\n", + " # Check if the thread is still alive (timed out)\n", + " if thread.is_alive():\n", + " result[\"error\"] = TimeoutError(\"Execution timed out\")\n", + " raise TimeoutError(\"Execution timed out\")\n", + " except Exception as e:\n", + " print(f\"Errpr at sandbox_exec: {e}\")\n", + " raise e\n", + "\n", + " return result\n", + "\n", + "# Example usage\n", + "code = \"\"\"\n", + "def add(a, b+5):\n", + " return a + b\n", + "\n", + "output = add(1, 2+y)\n", + "\"\"\"\n", + "\n", + "try:\n", + " result = sandbox_exec(code)\n", + " print(\"Sandbox output:\", result)\n", + "except TimeoutError as e:\n", + " print(e)\n", + "except Exception as e:\n", + " print(\"Sandbox error:\", e)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'multiply': , 'add': , 'divide': , 'search': , 'numpy_sum': , 'add_points': , 'Point': , 'np': , 'np.ndarray': , 'array': , 'arr': array([[1, 2],\n", + " [3, 4]]), 'np.array': , 'x': 2}\n", + "Query: add 2 and 3\n", + "GeneratorOutput(data={'action': 'add(2, b=3)'}, error=None, usage=None, raw_response='{\\n \"action\": \"add(2, b=3)\"\\n}')\n", + "FunctionExpression(thought=None, action='add(2, b=3)')\n", + "Function(thought=None, name='add', args=[2], kwargs={'b': 3})\n", + "func output: 5\n", + "func expr: add(2, b=3)\n", + "func output: 5\n", + "sandbox output: {'output': 5, 'error': None}\n", + "Query: search for something\n", + "GeneratorOutput(data={'action': \"search('something')\"}, error=None, usage=None, raw_response='{\\n \"action\": \"search(\\'something\\')\"\\n}')\n", + "FunctionExpression(thought=None, action=\"search('something')\")\n", + "Function(thought=None, name='search', args=['something'], kwargs={})\n", + "func output: \n", + "func expr: search('something')\n", + "func output: \n", + "sandbox output: {'output': , 'error': None}\n", + "Query: add points (1, 2) and (3, 4)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/h8/nhgbdr4d18x2r49j4pk5z6gw0000gn/T/ipykernel_30203/1940296491.py:40: RuntimeWarning: coroutine 'search' was never awaited\n", + " fun_output = eval(func_expr.action)\n", + "RuntimeWarning: Enable tracemalloc to get the object allocation traceback\n", + "/var/folders/h8/nhgbdr4d18x2r49j4pk5z6gw0000gn/T/ipykernel_30203/1940296491.py:22: RuntimeWarning: coroutine 'search' was never awaited\n", + " result = generator(prompt_kwargs=prompt_kwargs)\n", + "RuntimeWarning: Enable tracemalloc to get the object allocation traceback\n", + "/var/folders/h8/nhgbdr4d18x2r49j4pk5z6gw0000gn/T/ipykernel_30203/1940296491.py:33: RuntimeWarning: coroutine 'search' was never awaited\n", + " fun_output = all_functions_dict[func.name](*func.args, **func.kwargs)\n", + "RuntimeWarning: Enable tracemalloc to get the object allocation traceback\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"GeneratorOutput(data={'action': 'add_points(p1=Point(x=1, y=2), p2=Point(x=3, y=4))'}, error=None, usage=None, raw_response='```\\n{\\n \"action\": \"add_points(p1=Point(x=1, y=2), p2=Point(x=3, y=4))\"\\n}\\n```')\n", + "FunctionExpression(thought=None, action='add_points(p1=Point(x=1, y=2), p2=Point(x=3, y=4))')\n", + "Function(thought=None, name='add_points', args=[], kwargs={'p1': Point(x=1, y=2), 'p2': Point(x=3, y=4)})\n", + "func output: Point(x=4, y=6)\n", + "func expr: add_points(p1=Point(x=1, y=2), p2=Point(x=3, y=4))\n", + "func output: Point(x=4, y=6)\n", + "sandbox output: {'output': Point(x=4, y=6), 'error': None}\n", + "Query: sum numpy array with arr = np.array([[1, 2], [3, 4]])\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Error Field elements must be 2- or 3-tuples, got '3' parsing function call expression: numpy_sum(arr=np.array([1, 2], [3, 4]))\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "GeneratorOutput(data={'action': 'numpy_sum(arr=np.array([1, 2], [3, 4]))'}, error=None, usage=None, raw_response='{\\n \"action\": \"numpy_sum(arr=np.array([[1, 2], [3, 4]]))\"\\n}')\n", + "FunctionExpression(thought=None, action='numpy_sum(arr=np.array([1, 2], [3, 4]))')\n", + "Error Field elements must be 2- or 3-tuples, got '3' parsing function call expression: numpy_sum(arr=np.array([1, 2], [3, 4]))\n", + "Failed to execute the function for query: sum numpy array with arr = np.array([[1, 2], [3, 4]]), func: {'action': 'numpy_sum(arr=np.array([1, 2], [3, 4]))'}, error: Error Field elements must be 2- or 3-tuples, got '3' parsing function call expression: numpy_sum(arr=np.array([1, 2], [3, 4]))\n", + "Field elements must be 2- or 3-tuples, got '3'\n", + "Failed to execute the function for query: sum numpy array with arr = np.array([[1, 2], [3, 4]]), func: {'action': 'numpy_sum(arr=np.array([1, 2], [3, 4]))'}, error: Field elements must be 2- or 3-tuples, got '3'\n", + "Query: multiply 2 with local variable x\n", + "GeneratorOutput(data={'action': 'multiply(2, b=x)'}, error=None, usage=None, raw_response='{\\n \"action\": \"multiply(2, b=x)\"\\n}')\n", + "FunctionExpression(thought=None, action='multiply(2, b=x)')\n", + "Function(thought=None, name='multiply', args=[2], kwargs={'b': 2})\n", + "func output: 4\n", + "func expr: multiply(2, b=x)\n", + "func output: 4\n", + "sandbox output: {'output': 4, 'error': None}\n", + "Query: divide 2 by 3\n", + "GeneratorOutput(data={'action': 'divide(2.0, b=3.0)'}, error=None, usage=None, raw_response='{\\n \"action\": \"divide(2.0, b=3.0)\"\\n}')\n", + "FunctionExpression(thought=None, action='divide(2.0, b=3.0)')\n", + "Function(thought=None, name='divide', args=[2.0], kwargs={'b': 3.0})\n", + "func output: 0.6666666666666666\n", + "func expr: divide(2.0, b=3.0)\n", + "func output: 0.6666666666666666\n", + "sandbox output: {'output': 0.6666666666666666, 'error': None}\n", + "Query: Add 5 to variable y\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Error Error: 'y', y does not exist in the context_map. parsing function call expression: add(a=5, b=y)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "GeneratorOutput(data={'action': 'add(a=5, b=y)'}, error=None, usage=None, raw_response='{\\n \"action\": \"add(a=5, b=y)\"\\n}')\n", + "FunctionExpression(thought=None, action='add(a=5, b=y)')\n", + "Error Error: 'y', y does not exist in the context_map. 
parsing function call expression: add(a=5, b=y)\n", + "Failed to execute the function for query: Add 5 to variable y, func: {'action': 'add(a=5, b=y)'}, error: Error Error: 'y', y does not exist in the context_map. parsing function call expression: add(a=5, b=y)\n", + "func output: 9\n", + "sandbox output: {'output': None, 'error': NameError(\"name 'y' is not defined\")}\n" + ] + } + ], + "source": [ + "# run the generator but we will use FunctionTool.parse_function_call_expr and have a context map \n", + "\n", + "all_functions_dict.update(\n", + " {\n", + " \"Point\": Point,\n", + " # support numpy\n", + " \"np\": np,\n", + " \"np.ndarray\": np.ndarray,\n", + " \"array\": np.array,\n", + " \"arr\": arr,\n", + " \"np.array\": np.array,\n", + " \"x\": x\n", + " }\n", + ")\n", + "y=4\n", + "print(all_functions_dict)\n", + "for query in queries+[\"Add 5 to variable y\"]:\n", + "\n", + " try:\n", + " print(f\"Query: {query}\")\n", + " prompt_kwargs = {\"input_str\": query}\n", + " result = generator(prompt_kwargs=prompt_kwargs)\n", + " print(result)\n", + "\n", + " func_expr = FunctionExpression.from_dict(result.data)\n", + "\n", + " print(func_expr)\n", + " assert isinstance(func_expr, FunctionExpression), f\"Expected FunctionExpression, got {type(result.data)}\"\n", + "\n", + " # more secure way to handle function call\n", + " func: Function = FunctionTool.parse_function_call_expr(expr=func_expr.action, context_map=all_functions_dict)\n", + " print(func)\n", + " fun_output = all_functions_dict[func.name](*func.args, **func.kwargs)\n", + " print(\"func output:\", fun_output)\n", + "\n", + " print(f\"func expr: {func_expr.action}\")\n", + "\n", + " # eval without security check by using eval directly\n", + " # less secure but even more powerful and flexible\n", + " fun_output = eval(func_expr.action)\n", + " print(\"func output:\", fun_output)\n", + "\n", + " # sandbox_exec\n", + " action = \"output=\" + func_expr.action\n", + " result = sandbox_exec(action, context={**SAFE_BUILTINS, **all_functions_dict})\n", + " print(\"sandbox output:\", result)\n", + " except Exception as e:\n", + " print(e)\n", + " print(f\"Failed to execute the function for query: {query}, func: {result.data}, error: {e}\")\n", + " try:\n", + " fun_output = eval(func_expr.action)\n", + " print(\"func output:\", fun_output)\n", + "\n", + " #sandbox_exec\n", + " action = \"output=\" + func_expr.action\n", + " result = sandbox_exec(action, context={**SAFE_BUILTINS, **all_functions_dict})\n", + " print(\"sandbox output:\", result)\n", + " except Exception as e:\n", + " print(e)\n", + " print(f\"Failed to execute the function for query: {query}, func: {result.data}, error: {e}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Multiple function calls" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "multple_function_call_template = r\"\"\"You have these tools available:\n", + "{% if tools %}\n", + "\n", + "{% for tool in tools %}\n", + "{{ loop.index }}.\n", + "{{tool}}\n", + "------------------------\n", + "{% endfor %}\n", + "\n", + "{% endif %}\n", + "\n", + "Here is how you call one function.\n", + "{{output_format_str}}\n", + "Return a List using `[]` of the above JSON objects. 
You can have length of 1 or more.\n", + "Do not call multiple functions in one action field.\n", + "\n", + "\n", + "{{input_str}}\n", + "You:\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generator(\n", + " model_kwargs={'model': 'gpt-3.5-turbo'}, \n", + " (prompt): Prompt(\n", + " template: You have these tools available:\n", + " {% if tools %}\n", + " \n", + " {% for tool in tools %}\n", + " {{ loop.index }}.\n", + " {{tool}}\n", + " ------------------------\n", + " {% endfor %}\n", + " \n", + " {% endif %}\n", + " \n", + " Here is how you call one function.\n", + " {{output_format_str}}\n", + " Return a List using `[]` of the above JSON objects. You can have length of 1 or more.\n", + " Do not call multiple functions in one action field.\n", + " \n", + " \n", + " {{input_str}}\n", + " You:\n", + " , prompt_kwargs: {'tools': ['func_name: multiply\\nfunc_desc: Multiply two numbers.\\nfunc_parameters:\\n type: object\\n properties:\\n a:\\n type: int\\n b:\\n type: int\\n required:\\n - a\\n - b\\n', 'func_name: add\\nfunc_desc: Add two numbers.\\nfunc_parameters:\\n type: object\\n properties:\\n a:\\n type: int\\n b:\\n type: int\\n required:\\n - a\\n - b\\n', 'func_name: divide\\nfunc_desc: Divide two numbers.\\nfunc_parameters:\\n type: object\\n properties:\\n a:\\n type: float\\n b:\\n type: float\\n required:\\n - a\\n - b\\n', 'func_name: search\\nfunc_desc: Search for query and return a list of results.\\nfunc_parameters:\\n type: object\\n properties:\\n query:\\n type: str\\n required:\\n - query\\n', 'func_name: numpy_sum\\nfunc_desc: Sum the elements of an array.\\nfunc_parameters:\\n type: object\\n properties:\\n arr:\\n type: ndarray\\n required:\\n - arr\\n', 'func_name: add_points\\nfunc_desc: None\\nfunc_parameters:\\n type: object\\n properties:\\n p1:\\n type: Point\\n properties:\\n x:\\n type: int\\n y:\\n type: int\\n required:\\n - x\\n - y\\n p2:\\n type: Point\\n properties:\\n x:\\n type: int\\n y:\\n type: int\\n required:\\n - x\\n - y\\n required:\\n - p1\\n - p2\\n'], 'output_format_str': 'Your output should be formatted as a standard JSON instance with the following schema:\\n```\\n{\\n \"action\": \"Formatted as FuncName(, ), where FuncName is the function name, are positional arguments, and are keyword arguments in key=value form. Example: \\'FuncName(1, b=2)\\' calls \\'FuncName\\' with positional argument 1 and keyword argument b=2. (str) (required)\"\\n}\\n```\\nHere is an example:\\n```\\n{\\n \"action\": \"add_points(p1=Point(x=1, y=2), p2=Point(x=3, y=4))\"\\n}\\n```\\n-Make sure to always enclose the JSON output in triple backticks (```). 
Please do not add anything other than valid JSON output!\\n-Use double quotes for the keys and string values.\\n-Follow the JSON formatting conventions.'}, prompt_variables: ['output_format_str', 'input_str', 'tools']\n", + " )\n", + " (model_client): OpenAIClient()\n", + " (output_processors): JsonParser()\n", + ")\n", + "Prompt:\n", + "\n", + "You have these tools available:\n", + "\n", + "1.\n", + "func_name: multiply\n", + "func_desc: Multiply two numbers.\n", + "func_parameters:\n", + " type: object\n", + " properties:\n", + " a:\n", + " type: int\n", + " b:\n", + " type: int\n", + " required:\n", + " - a\n", + " - b\n", + "\n", + "------------------------\n", + "2.\n", + "func_name: add\n", + "func_desc: Add two numbers.\n", + "func_parameters:\n", + " type: object\n", + " properties:\n", + " a:\n", + " type: int\n", + " b:\n", + " type: int\n", + " required:\n", + " - a\n", + " - b\n", + "\n", + "------------------------\n", + "3.\n", + "func_name: divide\n", + "func_desc: Divide two numbers.\n", + "func_parameters:\n", + " type: object\n", + " properties:\n", + " a:\n", + " type: float\n", + " b:\n", + " type: float\n", + " required:\n", + " - a\n", + " - b\n", + "\n", + "------------------------\n", + "4.\n", + "func_name: search\n", + "func_desc: Search for query and return a list of results.\n", + "func_parameters:\n", + " type: object\n", + " properties:\n", + " query:\n", + " type: str\n", + " required:\n", + " - query\n", + "\n", + "------------------------\n", + "5.\n", + "func_name: numpy_sum\n", + "func_desc: Sum the elements of an array.\n", + "func_parameters:\n", + " type: object\n", + " properties:\n", + " arr:\n", + " type: ndarray\n", + " required:\n", + " - arr\n", + "\n", + "------------------------\n", + "6.\n", + "func_name: add_points\n", + "func_desc: None\n", + "func_parameters:\n", + " type: object\n", + " properties:\n", + " p1:\n", + " type: Point\n", + " properties:\n", + " x:\n", + " type: int\n", + " y:\n", + " type: int\n", + " required:\n", + " - x\n", + " - y\n", + " p2:\n", + " type: Point\n", + " properties:\n", + " x:\n", + " type: int\n", + " y:\n", + " type: int\n", + " required:\n", + " - x\n", + " - y\n", + " required:\n", + " - p1\n", + " - p2\n", + "\n", + "------------------------\n", + "\n", + "\n", + "Here is how you call one function.\n", + "Your output should be formatted as a standard JSON instance with the following schema:\n", + "```\n", + "{\n", + " \"action\": \"Formatted as FuncName(, ), where FuncName is the function name, are positional arguments, and are keyword arguments in key=value form. Example: 'FuncName(1, b=2)' calls 'FuncName' with positional argument 1 and keyword argument b=2. (str) (required)\"\n", + "}\n", + "```\n", + "Here is an example:\n", + "```\n", + "{\n", + " \"action\": \"add_points(p1=Point(x=1, y=2), p2=Point(x=3, y=4))\"\n", + "}\n", + "```\n", + "-Make sure to always enclose the JSON output in triple backticks (```). Please do not add anything other than valid JSON output!\n", + "-Use double quotes for the keys and string values.\n", + "-Follow the JSON formatting conventions.\n", + "Return a List using `[]` of the above JSON objects. 
You can have length of 1 or more.\n", + "Do not call multiple functions in one action field.\n", + "\n", + "\n", + "None\n", + "You:\n", + "\n" + ] + } + ], + "source": [ + "queries = [\"add 2 and 3\", \"search for something\", \"add points (1, 2) and (3, 4)\", \"sum numpy array with arr = np.array([[1, 2], [3, 4]])\", \"multiply 2 with local variable x\", \"divide 2 by 3\"]\n", + "\n", + "from lightrag.components.output_parsers import ListOutputParser\n", + "from lightrag.core.string_parser import JsonParser # improve a list of json\n", + "\n", + "preset_prompt_kwargs = {\n", + " \"tools\": [tool.definition.to_yaml() for tool in tools],\n", + " \"output_format_str\": parser.format_instructions(exclude=[\"thought\"])\n", + " }\n", + "multi_call_gen = Generator(\n", + " model_client=ModelClientType.OPENAI(),\n", + " model_kwargs=model_kwargs,\n", + " template=multple_function_call_template,\n", + " prompt_kwargs=preset_prompt_kwargs,\n", + " output_processors=JsonParser()\n", + ")\n", + "print(multi_call_gen)\n", + "multi_call_gen.print_prompt()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Query: add 2 and 3 and search for something\n", + "_________________________\n", + "\n", + "GeneratorOutput(data={'action': 'add(2, b=3)'}, error=None, usage=None, raw_response='``` \\n{\\n \"action\": \"add(2, b=3)\"\\n}\\n{\\n \"action\": \"search(query=\\'something\\')\"\\n}\\n```')\n", + "'str' object has no attribute 'items'\n", + "Failed to parse the function for query: add 2 and 3 and search for something, func: {'action': 'add(2, b=3)'}, error: 'str' object has no attribute 'items'\n", + "Query: add points (1, 2) and (3, 4) and sum numpy array with arr = np.array([[1, 2], [3, 4]])\n", + "_________________________\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Error Field elements must be 2- or 3-tuples, got '3' parsing function call expression: numpy_sum(arr=np.array([1, 2], [3, 4]))\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "GeneratorOutput(data=[{'action': 'add_points(p1=Point(x=1, y=2), p2=Point(x=3, y=4))'}, {'action': 'numpy_sum(arr=np.array([1, 2], [3, 4]))'}], error=None, usage=None, raw_response='```json\\n[\\n {\\n \"action\": \"add_points(p1=Point(x=1, y=2), p2=Point(x=3, y=4))\"\\n },\\n {\\n \"action\": \"numpy_sum(arr=np.array([[1, 2], [3, 4]]))\"\\n }\\n]\\n```')\n", + "[FunctionExpression(thought=None, action='add_points(p1=Point(x=1, y=2), p2=Point(x=3, y=4))'), FunctionExpression(thought=None, action='numpy_sum(arr=np.array([1, 2], [3, 4]))')]\n", + "Function(thought=None, name='add_points', args=[], kwargs={'p1': Point(x=1, y=2), 'p2': Point(x=3, y=4)})\n", + "func output: Point(x=4, y=6)\n", + "Error Field elements must be 2- or 3-tuples, got '3' parsing function call expression: numpy_sum(arr=np.array([1, 2], [3, 4]))\n", + "Failed to execute the function for query: add points (1, 2) and (3, 4) and sum numpy array with arr = np.array([[1, 2], [3, 4]]), func: [{'action': 'add_points(p1=Point(x=1, y=2), p2=Point(x=3, y=4))'}, {'action': 'numpy_sum(arr=np.array([1, 2], [3, 4]))'}], error: Error Field elements must be 2- or 3-tuples, got '3' parsing function call expression: numpy_sum(arr=np.array([1, 2], [3, 4]))\n", + "func expr: add_points(p1=Point(x=1, y=2), p2=Point(x=3, y=4))\n", + "func output: Point(x=4, y=6)\n", + "func expr: numpy_sum(arr=np.array([1, 2], [3, 4]))\n", + "Field elements 
must be 2- or 3-tuples, got '3'\n", + "Failed to execute the function for query: add points (1, 2) and (3, 4) and sum numpy array with arr = np.array([[1, 2], [3, 4]]), func: [{'action': 'add_points(p1=Point(x=1, y=2), p2=Point(x=3, y=4))'}, {'action': 'numpy_sum(arr=np.array([1, 2], [3, 4]))'}], error: Field elements must be 2- or 3-tuples, got '3'\n", + "sandbox output: {'output': Point(x=4, y=6), 'error': None}\n", + "sandbox output: {'output': None, 'error': TypeError(\"Field elements must be 2- or 3-tuples, got '3'\")}\n", + "fun_output by sandbox: [{'output': Point(x=4, y=6), 'error': None}, {'output': None, 'error': TypeError(\"Field elements must be 2- or 3-tuples, got '3'\")}]\n", + "_________________________\n", + "\n", + "Query: multiply 2 with local variable x and divide 2 by 3\n", + "_________________________\n", + "\n", + "GeneratorOutput(data=[{'action': 'multiply(2, b=x)'}, {'action': 'divide(a=2, b=3)'}], error=None, usage=None, raw_response='```json\\n[\\n {\\n \"action\": \"multiply(2, b=x)\"\\n },\\n {\\n \"action\": \"divide(a=2, b=3)\"\\n }\\n]\\n```')\n", + "[FunctionExpression(thought=None, action='multiply(2, b=x)'), FunctionExpression(thought=None, action='divide(a=2, b=3)')]\n", + "Function(thought=None, name='multiply', args=[2], kwargs={'b': 2})\n", + "func output: 4\n", + "Function(thought=None, name='divide', args=[], kwargs={'a': 2, 'b': 3})\n", + "func output: 0.6666666666666666\n", + "fun_output by parsing: [4, 0.6666666666666666]\n", + "_________________________\n", + "\n", + "func expr: multiply(2, b=x)\n", + "func output: 4\n", + "func expr: divide(a=2, b=3)\n", + "func output: 0.6666666666666666\n", + "fun_output by eval: [4, 0.6666666666666666]\n", + "_________________________\n", + "\n", + "sandbox output: {'output': 4, 'error': None}\n", + "sandbox output: {'output': 0.6666666666666666, 'error': None}\n", + "fun_output by sandbox: [{'output': 4, 'error': None}, {'output': 0.6666666666666666, 'error': None}]\n", + "_________________________\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/h8/nhgbdr4d18x2r49j4pk5z6gw0000gn/T/ipykernel_30203/2050537243.py:47: RuntimeWarning: coroutine 'search' was never awaited\n", + " func_outputs_1 = [execute_function_by_parsing(func_expr, all_functions_dict) for func_expr in func_exprs]\n", + "RuntimeWarning: Enable tracemalloc to get the object allocation traceback\n", + "/var/folders/h8/nhgbdr4d18x2r49j4pk5z6gw0000gn/T/ipykernel_30203/2050537243.py:55: RuntimeWarning: coroutine 'search' was never awaited\n", + " func_outputs_2 = [execute_function_by_eval(func_expr) for func_expr in func_exprs]\n", + "RuntimeWarning: Enable tracemalloc to get the object allocation traceback\n" + ] + } + ], + "source": [ + "def execute_function_by_parsing(func_expr: FunctionExpression, all_functions_dict: Dict[str, Any]) -> Any:\n", + " func: Function = FunctionTool.parse_function_call_expr(expr=func_expr.action, context_map=all_functions_dict)\n", + " print(func)\n", + " fun_output = all_functions_dict[func.name](*func.args, **func.kwargs)\n", + " print(\"func output:\", fun_output)\n", + " return fun_output\n", + "\n", + "\n", + "def execute_function_by_eval(func_expr: FunctionExpression) -> Any:\n", + "\n", + " print(f\"func expr: {func_expr.action}\")\n", + "\n", + " # eval without security check by using eval directly\n", + " # less secure but even more powerful and flexible\n", + " fun_output = eval(func_expr.action)\n", + " print(\"func output:\", fun_output)\n", + " return 
fun_output\n", + "\n", + "def execute_function_by_sandbox(func_expr: FunctionExpression, all_functions_dict: Dict[str, Any]) -> Any:\n", + " # sandbox_exec\n", + " action = \"output=\" + func_expr.action\n", + " result = sandbox_exec(action, context={**SAFE_BUILTINS, **all_functions_dict})\n", + " print(\"sandbox output:\", result)\n", + "\n", + " return result\n", + "\n", + "\n", + "\n", + "\n", + "for i in range(0, len(queries), 2):\n", + " query = \" and \".join(queries[i:i+2])\n", + " print(f\"Query: {query}\\n_________________________\\n\")\n", + " prompt_kwargs = {\"input_str\": query}\n", + " result = multi_call_gen(prompt_kwargs=prompt_kwargs)\n", + " print(result)\n", + "\n", + " try:\n", + "\n", + " func_exprs = [FunctionExpression.from_dict(item) for item in result.data]\n", + "\n", + " print(func_exprs)\n", + " except Exception as e:\n", + " print(e)\n", + " print(f\"Failed to parse the function for query: {query}, func: {result.data}, error: {e}\")\n", + " continue\n", + " try:\n", + " func_outputs_1 = [execute_function_by_parsing(func_expr, all_functions_dict) for func_expr in func_exprs]\n", + " print(f\"fun_output by parsing: {func_outputs_1}\\n_________________________\\n\")\n", + " except Exception as e:\n", + " print(e)\n", + " print(f\"Failed to execute the function for query: {query}, func: {result.data}, error: {e}\")\n", + "\n", + " try:\n", + "\n", + " func_outputs_2 = [execute_function_by_eval(func_expr) for func_expr in func_exprs]\n", + " print(f\"fun_output by eval: {func_outputs_2}\\n_________________________\\n\")\n", + " except Exception as e:\n", + " print(e)\n", + " print(f\"Failed to execute the function for query: {query}, func: {result.data}, error: {e}\")\n", + "\n", + " try:\n", + "\n", + " func_outputs_3 = [execute_function_by_sandbox(func_expr, all_functions_dict) for func_expr in func_exprs]\n", + " print(f\"fun_output by sandbox: {func_outputs_3}\\n_________________________\\n\")\n", + " except Exception as e:\n", + " print(e)\n", + " print(f\"Failed to execute the function for query: {query}, func: {result.data}, error: {e}\")\n", + "\n", + " \n" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "response: ChatCompletion(id='chatcmpl-9eDBpPnQkSDM90VqKgmtGsMJ3k7jJ', choices=[Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content=None, role='assistant', function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_0f6vJCTGXHRDlr3Uns6cczlJ', function=Function(arguments='{\"location\": \"San Francisco, CA\", \"unit\": \"celsius\"}', name='get_current_weather'), type='function'), ChatCompletionMessageToolCall(id='call_GZwXUQ2hVeLmOuf6Ty28JluG', function=Function(arguments='{\"location\": \"Tokyo, Japan\", \"unit\": \"celsius\"}', name='get_current_weather'), type='function'), ChatCompletionMessageToolCall(id='call_ASnPn2pGcyixg7AjCr4DSdYs', function=Function(arguments='{\"location\": \"Paris, France\", \"unit\": \"celsius\"}', name='get_current_weather'), type='function')]))], created=1719370849, model='gpt-4o-2024-05-13', object='chat.completion', service_tier=None, system_fingerprint='fp_4008e3b719', usage=CompletionUsage(completion_tokens=83, prompt_tokens=88, total_tokens=171))\n", + "tool_calls: [ChatCompletionMessageToolCall(id='call_0f6vJCTGXHRDlr3Uns6cczlJ', function=Function(arguments='{\"location\": \"San Francisco, CA\", \"unit\": \"celsius\"}', name='get_current_weather'), 
type='function'), ChatCompletionMessageToolCall(id='call_GZwXUQ2hVeLmOuf6Ty28JluG', function=Function(arguments='{\"location\": \"Tokyo, Japan\", \"unit\": \"celsius\"}', name='get_current_weather'), type='function'), ChatCompletionMessageToolCall(id='call_ASnPn2pGcyixg7AjCr4DSdYs', function=Function(arguments='{\"location\": \"Paris, France\", \"unit\": \"celsius\"}', name='get_current_weather'), type='function')]\n", + "ChatCompletion(id='chatcmpl-9eDBq0oXMtJYAB9TOfAqm9dNNQoJf', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are the current temperatures in Celsius:\\n\\n- **San Francisco, CA:** \\\\(72^\\\\circ \\\\text{C}\\\\)\\n- **Tokyo, Japan:** \\\\(10^\\\\circ \\\\text{C}\\\\)\\n- **Paris, France:** \\\\(22^\\\\circ \\\\text{C}\\\\)\\n\\nLet me know if you need any more information!', role='assistant', function_call=None, tool_calls=None))], created=1719370850, model='gpt-4o-2024-05-13', object='chat.completion', service_tier=None, system_fingerprint='fp_d576307f90', usage=CompletionUsage(completion_tokens=67, prompt_tokens=176, total_tokens=243))\n" + ] + } + ], + "source": [ + "# first check the openai's function call apis\n", + "\n", + "from openai import OpenAI\n", + "from openai.types import FunctionDefinition\n", + "from lightrag.utils import setup_env\n", + "import json\n", + "\n", + "client = OpenAI()\n", + "\n", + "# Example dummy function hard coded to return the same weather\n", + "# In production, this could be your backend API or an external API\n", + "def get_current_weather(location, unit=\"fahrenheit\"):\n", + " \"\"\"Get the current weather in a given location\"\"\"\n", + " if \"tokyo\" in location.lower():\n", + " return json.dumps({\"location\": \"Tokyo\", \"temperature\": \"10\", \"unit\": unit})\n", + " elif \"san francisco\" in location.lower():\n", + " return json.dumps({\"location\": \"San Francisco\", \"temperature\": \"72\", \"unit\": unit})\n", + " elif \"paris\" in location.lower():\n", + " return json.dumps({\"location\": \"Paris\", \"temperature\": \"22\", \"unit\": unit})\n", + " else:\n", + " return json.dumps({\"location\": location, \"temperature\": \"unknown\"})\n", + "\n", + "def run_conversation():\n", + " # Step 1: send the conversation and available functions to the model\n", + " messages = [{\"role\": \"user\", \"content\": \"What's the weather like in San Francisco, Tokyo, and Paris in celsius?\"}]\n", + " tools = [\n", + " {\n", + " \"type\": \"function\",\n", + " \"function\": {\n", + " \"name\": \"get_current_weather\",\n", + " \"description\": \"Get the current weather in a given location\",\n", + " \"parameters\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"location\": {\n", + " \"type\": \"string\",\n", + " \"description\": \"The city and state, e.g. 
San Francisco, CA\",\n", + " },\n", + " \"unit\": {\"type\": \"string\", \"enum\": [\"celsius\", \"fahrenheit\"]},\n", + " },\n", + " \"required\": [\"location\"],\n", + " },\n", + " },\n", + " }\n", + " ]\n", + " response = client.chat.completions.create(\n", + " model=\"gpt-4o\",\n", + " messages=messages,\n", + " tools=tools,\n", + " tool_choice=\"auto\", # auto is default, but we'll be explicit\n", + " )\n", + " print(f\"response: {response}\")\n", + " response_message = response.choices[0].message\n", + " tool_calls = response_message.tool_calls\n", + "\n", + " print(f\"tool_calls: {tool_calls}\")\n", + " # Step 2: check if the model wanted to call a function\n", + " if tool_calls:\n", + " # Step 3: call the function\n", + " # Note: the JSON response may not always be valid; be sure to handle errors\n", + " available_functions = {\n", + " \"get_current_weather\": get_current_weather,\n", + " } # only one function in this example, but you can have multiple\n", + " messages.append(response_message) # extend conversation with assistant's reply\n", + " # Step 4: send the info for each function call and function response to the model\n", + " for tool_call in tool_calls:\n", + " function_name = tool_call.function.name\n", + " function_to_call = available_functions[function_name]\n", + " function_args = json.loads(tool_call.function.arguments)# use json.loads to convert a string to a dictionary\n", + " # function_response = function_to_call(\n", + " # location=function_args.get(\"location\"),\n", + " # unit=function_args.get(\"unit\"),\n", + " # ) \n", + " # you have to exactly know the arguments, this does not make sense. How would i know its arguments. **function_args (makes more sense)\n", + " function_response = function_to_call(**function_args)\n", + " messages.append(\n", + " {\n", + " \"tool_call_id\": tool_call.id,\n", + " \"role\": \"tool\",\n", + " \"name\": function_name,\n", + " \"content\": function_response,\n", + " }\n", + " ) # extend conversation with function response\n", + " second_response = client.chat.completions.create(\n", + " model=\"gpt-4o\",\n", + " messages=messages,\n", + " ) # get a new response from the model where it can see the function response\n", + " return second_response\n", + "print(run_conversation())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "# Function(arguments='{\"location\": \"Tokyo, Japan\", \"unit\": \"celsius\"}', name='get_current_weather'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There are two important pieces. Getting function schema is not difficult and can be standarized.\n", + "\n", + "The second piece is how to call the function, and how to execute it. The how to call the function depends on how we execute it.\n", + "\n", + "How to execute a function:\n", + "1. Eval (LLM will output the code to call the function (in string format))-> Language generation.\n", + "2. We manage a function map, and we ask LLm to output either the code string or a structure with the function name and the arguments. We can use the function map to call the function. If its code string, we will have to parse the function call into the name and the arguments. 
 + "\n", + "There are many different ways to do the actual function call, and different LLMs might react differently in accuracy to each output format." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ + "Function(arguments='{\"location\": \"Paris, France\"}', name='get_current_weather'), type='function')" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ + "def get_current_weather(location: str, unit: str = \"fahrenheit\"):\n", + " \"\"\"Get the current weather in a given location\"\"\"\n", + " if \"tokyo\" in location.lower():\n", + " return json.dumps({\"location\": \"Tokyo\", \"temperature\": \"10\", \"unit\": unit})\n", + " elif \"san francisco\" in location.lower():\n", + " return json.dumps(\n", + " {\"location\": \"San Francisco\", \"temperature\": \"72\", \"unit\": unit}\n", + " )\n", + " elif \"paris\" in location.lower():\n", + " return json.dumps({\"location\": \"Paris\", \"temperature\": \"22\", \"unit\": unit})\n", + " else:\n", + " return json.dumps({\"location\": location, \"temperature\": \"unknown\"})\n", + "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ + "# v2\n", + "\n", + "from lightrag.core.base_data_class import DataClass\n", + "from dataclasses import dataclass, field\n", + "\n", + "@dataclass\n", + "class Weather(DataClass):\n", + " location: str = field(metadata={\"description\": \"The city and state, e.g. San Francisco, CA\"})\n", + " unit: str = field(metadata={\"enum\": [\"celsius\", \"fahrenheit\"]})\n", + "\n", + "def get_current_weather_2(weather: Weather):\n", + " \"\"\"Get the current weather in a given location\"\"\"\n", + " if \"tokyo\" in weather.location.lower():\n", + " return json.dumps({\"location\": \"Tokyo\", \"temperature\": \"10\", \"unit\": weather.unit})\n", + " elif \"san francisco\" in weather.location.lower():\n", + " return json.dumps(\n", + " {\"location\": \"San Francisco\", \"temperature\": \"72\", \"unit\": weather.unit}\n", + " )\n", + " elif \"paris\" in weather.location.lower():\n", + " return json.dumps({\"location\": \"Paris\", \"temperature\": \"22\", \"unit\": weather.unit})\n", + " else:\n", + " return json.dumps({\"location\": weather.location, \"temperature\": \"unknown\"})" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "name: weather, parameter: weather: __main__.Weather \n", + "type_hints[name]: \n", + "name: location, parameter: location: str \n", + "name: unit, parameter: unit: str \n", + "{\n", + " \"name\": \"get_current_weather_2\",\n", + " \"description\": \"get_current_weather_2(weather: __main__.Weather)\\nGet the current weather in a given location\",\n", + " \"parameters\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"weather\": {\n", + " \"type\": \"Weather\",\n", + " \"description\": \"The city and state, e.g.
San Francisco, CA\",\n", + " \"enum\": [\n", + " \"celsius\",\n", + " \"fahrenheit\"\n", + " ]\n", + " }\n", + " },\n", + " \"required\": [\n", + " \"weather\"\n", + " ],\n", + " \"definitions\": {\n", + " \"weather\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"location\": {\n", + " \"type\": \"str\"\n", + " },\n", + " \"unit\": {\n", + " \"type\": \"str\"\n", + " }\n", + " },\n", + " \"required\": [\n", + " \"location\",\n", + " \"unit\"\n", + " ]\n", + " }\n", + " }\n", + " }\n", + "}\n" + ] + } + ], + "source": [ + "# Create a tool from the class\n", + "\n", + "tool_2 = FunctionTool.from_defaults(fn=get_current_weather_2)\n", + "\n", + "print(tool_2.metadata.to_json())\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Llamaindex\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lightrag_fn_schema =\n", + "{\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"weather\": {\n", + " \"type\": \"Weather\",\n", + " \"desc\": \"The city and state, e.g. San Francisco, CA\",\n", + " \"enum\": [\n", + " \"celsius\",\n", + " \"fahrenheit\"\n", + " ]\n", + " }\n", + " },\n", + " \"required\": [\n", + " \"weather\"\n", + " ],\n", + " \"definitions\": {\n", + " \"weather\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"location\": {\n", + " \"type\": \"str\"\n", + " },\n", + " \"unit\": {\n", + " \"type\": \"str\"\n", + " }\n", + " },\n", + " \"required\": [\n", + " \"location\",\n", + " \"unit\"\n", + " ]\n", + " }\n", + " }\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + " llama_fn_schema = {\n", + " \"type\": \"object\",\n", + " \"properties\": {\"weather\": {\"$ref\": \"#/definitions/Weather\"}},\n", + " \"required\": [\"weather\"],\n", + " \"definitions\": {\n", + " \"Weather\": {\n", + " \"title\": \"Weather\",\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"location\": {\n", + " \"title\": \"Location\",\n", + " \"desc\": \"The city and state, e.g. 
San Francisco, CA\",\n", + " \"type\": \"string\",\n", + " },\n", + " \"unit\": {\n", + " \"title\": \"Unit\",\n", + " \"enum\": [\"celsius\", \"fahrenheit\"],\n", + " \"type\": \"string\",\n", + " },\n", + " },\n", + " \"required\": [\"location\", \"unit\"],\n", + " \"additionalProperties\": false,\n", + " }\n", + " },\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# level 1, call function with default python data types\n", + "# such as str, int, float, list, dict, etc.\n", + "\n", + "def _get_current_weather(location: str, unit: str = \"fahrenheit\"):\n", + " \"\"\"Get the current weather in a given location\"\"\"\n", + " if \"tokyo\" in location.lower():\n", + " return json.dumps({\"location\": \"Tokyo\", \"temperature\": \"10\", \"unit\": unit})\n", + " elif \"san francisco\" in location.lower():\n", + " return json.dumps(\n", + " {\"location\": \"San Francisco\", \"temperature\": \"72\", \"unit\": unit}\n", + " )\n", + " elif \"paris\" in location.lower():\n", + " return json.dumps({\"location\": \"Paris\", \"temperature\": \"22\", \"unit\": unit})\n", + " else:\n", + " return json.dumps({\"location\": location, \"temperature\": \"unknown\"})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "FunctionTool(metadata=ToolMetadata(name='_get_current_weather', description=\"_get_current_weather(location: str, unit: str = 'fahrenheit')\\nGet the current weather in a given location\", parameters={'type': 'object', 'properties': {'location': {'type': 'str'}, 'unit': {'type': 'str', 'default': 'fahrenheit'}}, 'required': ['location']}), fn=, async_fn=None)\n" + ] + } + ], + "source": [ + "# prepare function tool \n", + "weather_tool = FunctionTool.from_defaults(fn=_get_current_weather)\n", + "print(weather_tool)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Generator(\n", + " model_kwargs={'model': 'gpt-3.5-turbo', 'temperature': 0.3, 'stream': False}, \n", + " (prompt): Prompt(\n", + " template: You have these tools available:\n", + " \n", + " {% for tool in tools %}\n", + " {{ loop.index }}. ToolName: {{ tool.metadata.name }}\n", + " Tool Description: {{ tool.metadata.description }}\n", + " Tool Parameters: {{ tool.metadata.fn_schema_str }} \n", + " __________\n", + " {% endfor %}\n", + " \n", + " {{output_format_str}}\n", + " \n", + " User: {{input_str}}\n", + " You:\n", + " , prompt_kwargs: {'output_format_str': 'Your output should be formatted as a standard YAML instance with the following schema:\\n```\\nname: The name of the function (str) (required)\\nargs: The arguments of the function (Dict) (required)\\n```\\n\\n-Make sure to always enclose the YAML output in triple backticks (```). 
Please do not add anything other than valid YAML output!\\n-Follow the YAML formatting conventions with an indent of 2 spaces.\\n-Quote the string values properly.\\n'}, prompt_variables: ['input_str', 'output_format_str', 'tools']\n", + " )\n", + " (model_client): OpenAIClient()\n", + " (output_processors): YamlOutputParser(\n", + " data_class_for_yaml=\n", + " (yaml_output_format_prompt): Prompt(\n", + " template: Your output should be formatted as a standard YAML instance with the following schema:\n", + " ```\n", + " {{schema}}\n", + " ```\n", + " {% if example %}\n", + " Here is an example:\n", + " ```\n", + " {{example}}\n", + " ```\n", + " {% endif %}\n", + " \n", + " -Make sure to always enclose the YAML output in triple backticks (```). Please do not add anything other than valid YAML output!\n", + " -Follow the YAML formatting conventions with an indent of 2 spaces.\n", + " -Quote the string values properly.\n", + " , prompt_variables: ['example', 'schema']\n", + " )\n", + " (output_processors): YamlParser()\n", + " )\n", + ")" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# prepare a minimal function calling template \n", + "template = r\"\"\"You have these tools available:\n", + " \n", + " {% for tool in tools %}\n", + " {{ loop.index }}. ToolName: {{ tool.metadata.name }}\n", + " Tool Description: {{ tool.metadata.description }}\n", + " Tool Parameters: {{ tool.metadata.fn_schema_str }} \n", + " __________\n", + " {% endfor %}\n", + " \n", + " {{output_format_str}}\n", + " \n", + " User: {{input_str}}\n", + " You:\n", + " \"\"\"\n", + "\n", + "multiple_function_call_template = r\"\"\"You can answer user query with these tools:\n", + " \n", + " {% for tool in tools %}\n", + " {{ loop.index }}. ToolName: {{ tool.metadata.name }}\n", + " Tool Description: {{ tool.metadata.description }}\n", + " Tool Parameters: {{ tool.metadata.fn_schema_str }} \n", + " __________\n", + " {% endfor %}\n", + " \n", + " You can call multiple tools by return a list of the following format:\n", + " {{output_format_str}}\n", + " \n", + " User: {{input_str}}\n", + " You:\n", + " \"\"\"\n", + "\n", + "from typing import Dict, Any\n", + "from lightrag.core.generator import Generator\n", + "from lightrag.core.types import ModelClientType\n", + "from lightrag.components.output_parsers import YamlOutputParser\n", + "\n", + "model_kwargs = {\"model\": \"gpt-3.5-turbo\", \"temperature\": 0.3, \"stream\": False}\n", + "\n", + "@dataclass\n", + "class Function(DataClass):\n", + " name: str = field(metadata={\"desc\": \"The name of the function\"})\n", + " args: Dict[str, Any] = field(metadata={\"desc\": \"The arguments of the function\"})\n", + "\n", + "generator = Generator(\n", + " model_client=ModelClientType.OPENAI(),\n", + " model_kwargs=model_kwargs,\n", + " template=template,\n", + " prompt_kwargs={\n", + " # \"tools\": [weather_tool],\n", + " \"output_format_str\": YamlOutputParser(Function).format_instructions(),\n", + " # \"output_format_str\": Function.to_yaml_signature(),\n", + " },\n", + " output_processors=YamlOutputParser(Function),\n", + ")\n", + "generator" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prompt:\n", + "You have these tools available:\n", + " \n", + " 1. 
ToolName: _get_current_weather\n", + " Tool Description: _get_current_weather(location: str, unit: str = 'fahrenheit')\n", + "Get the current weather in a given location\n", + " Tool Parameters: {\"type\": \"object\", \"properties\": {\"location\": {\"type\": \"str\"}, \"unit\": {\"type\": \"str\", \"default\": \"fahrenheit\"}}, \"required\": [\"location\"]} \n", + " __________\n", + " \n", + " Your output should be formatted as a standard YAML instance with the following schema:\n", + "```\n", + "name: The name of the function (str) (required)\n", + "args: The arguments of the function (Dict) (required)\n", + "```\n", + "\n", + "-Make sure to always enclose the YAML output in triple backticks (```). Please do not add anything other than valid YAML output!\n", + "-Follow the YAML formatting conventions with an indent of 2 spaces.\n", + "-Quote the string values properly.\n", + "\n", + " \n", + " User: What's the weather like in San Francisco, Tokyo, and Paris in celsius?\n", + " You:\n", + " \n" + ] + } + ], + "source": [ + "# check the prompt\n", + "\n", + "input_str = \"What's the weather like in San Francisco, Tokyo, and Paris in celsius?\"\n", + "\n", + "generator.print_prompt(input_str=input_str, tools=[weather_tool])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Function(name='_get_current_weather', args={'location': 'San Francisco', 'unit': 'celsius'})\n" + ] + } + ], + "source": [ + "prompt_kwargs = {\n", + " \"input_str\": input_str,\n", + " \"tools\": [weather_tool],\n", + "}\n", + "output = generator(prompt_kwargs=prompt_kwargs)\n", + "structured_output = Function.from_dict(output.data)\n", + "print(structured_output)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\"location\": \"Paris\", \"temperature\": \"22\", \"unit\": \"celsius\"}\n" + ] + } + ], + "source": [ + "# call the function\n", + "\n", + "function_map = {\n", + " \"_get_current_weather\": weather_tool\n", + "}\n", + "\n", + "function_name = structured_output.name\n", + "function_args = structured_output.args\n", + "function_to_call = function_map[function_name]\n", + "function_response = function_to_call(**function_args)\n", + "print(function_response)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# multiple function calls" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Generator(\n", + " model_kwargs={'model': 'gpt-3.5-turbo', 'temperature': 0.3, 'stream': False}, \n", + " (prompt): Prompt(\n", + " template: You can answer user query with these tools:\n", + " \n", + " {% for tool in tools %}\n", + " {{ loop.index }}. 
ToolName: {{ tool.metadata.name }}\n", + " Tool Description: {{ tool.metadata.description }}\n", + " Tool Parameters: {{ tool.metadata.fn_schema_str }} \n", + " __________\n", + " {% endfor %}\n", + " \n", + " You can call multiple tools by return a list of the following format:\n", + " {{output_format_str}}\n", + " \n", + " User: {{input_str}}\n", + " You:\n", + " , prompt_kwargs: {'output_format_str': 'Your output should be formatted as a standard YAML instance with the following schema:\\n```\\nname: The name of the function (str) (required)\\nargs: The arguments of the function (Dict) (required)\\n```\\n\\n-Make sure to always enclose the YAML output in triple backticks (```). Please do not add anything other than valid YAML output!\\n-Follow the YAML formatting conventions with an indent of 2 spaces.\\n-Quote the string values properly.\\n'}, prompt_variables: ['input_str', 'output_format_str', 'tools']\n", + " )\n", + " (model_client): OpenAIClient()\n", + " (output_processors): YamlOutputParser(\n", + " data_class_for_yaml=\n", + " (yaml_output_format_prompt): Prompt(\n", + " template: Your output should be formatted as a standard YAML instance with the following schema:\n", + " ```\n", + " {{schema}}\n", + " ```\n", + " {% if example %}\n", + " Here is an example:\n", + " ```\n", + " {{example}}\n", + " ```\n", + " {% endif %}\n", + " \n", + " -Make sure to always enclose the YAML output in triple backticks (```). Please do not add anything other than valid YAML output!\n", + " -Follow the YAML formatting conventions with an indent of 2 spaces.\n", + " -Quote the string values properly.\n", + " , prompt_variables: ['example', 'schema']\n", + " )\n", + " (output_processors): YamlParser()\n", + " )\n", + ")" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "generator = Generator(\n", + " model_client=ModelClientType.OPENAI(),\n", + " model_kwargs=model_kwargs,\n", + " template=multiple_function_call_template,\n", + " prompt_kwargs={\n", + " # \"tools\": [weather_tool],\n", + " \"output_format_str\": YamlOutputParser(Function).format_instructions(),\n", + " # \"output_format_str\": Function.to_yaml_signature(),\n", + " },\n", + " output_processors=YamlOutputParser(Function),\n", + ")\n", + "generator" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "GeneratorOutput(data=[{'name': '_get_current_weather', 'args': {'location': 'San Francisco', 'unit': 'celsius'}}, {'name': '_get_current_weather', 'args': {'location': 'Tokyo', 'unit': 'celsius'}}, {'name': '_get_current_weather', 'args': {'location': 'Paris', 'unit': 'celsius'}}], error=None, usage=None, raw_response='```yaml\\n- name: _get_current_weather\\n args:\\n location: \"San Francisco\"\\n unit: \"celsius\"\\n- name: _get_current_weather\\n args:\\n location: \"Tokyo\"\\n unit: \"celsius\"\\n- name: _get_current_weather\\n args:\\n location: \"Paris\"\\n unit: \"celsius\"\\n```')\n", + "[Function(name='_get_current_weather', args={'location': 'San Francisco', 'unit': 'celsius'}), Function(name='_get_current_weather', args={'location': 'Tokyo', 'unit': 'celsius'}), Function(name='_get_current_weather', args={'location': 'Paris', 'unit': 'celsius'})]\n" + ] + } + ], + "source": [ + "# run the query\n", + "\n", + "output = generator(prompt_kwargs=prompt_kwargs)\n", + "list_structured_output = [Function.from_dict(item) for item in output.data]\n", + 
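"# show the raw GeneratorOutput and the list of parsed Function calls\n", +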
"print(output)\n", + "print(list_structured_output)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\"location\": \"San Francisco\", \"temperature\": \"72\", \"unit\": \"celsius\"}\n", + "{\"location\": \"Tokyo\", \"temperature\": \"10\", \"unit\": \"celsius\"}\n", + "{\"location\": \"Paris\", \"temperature\": \"22\", \"unit\": \"celsius\"}\n" + ] + } + ], + "source": [ + "for structured_output in list_structured_output:\n", + " function_name = structured_output.name\n", + " function_args = structured_output.args\n", + " function_to_call = function_map[function_name]\n", + " function_response = function_to_call(**function_args)\n", + " print(function_response)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Person(name='John Doe', age=30, address=Address(street='123 Main St', city='Anytown', zipcode='12345'))\n", + "{'name': 'John Doe', 'age': 30, 'address': {'street': '123 Main St', 'city': 'Anytown', 'zipcode': '12345'}}\n" + ] + } + ], + "source": [ + "from dataclasses import dataclass, field\n", + "from typing import Any, Dict\n", + "\n", + "@dataclass\n", + "class Address:\n", + " street: str\n", + " city: str\n", + " zipcode: str\n", + "\n", + "@dataclass\n", + "class Person:\n", + " name: str\n", + " age: int\n", + " address: Address\n", + "\n", + "# Example instance of the nested dataclasses\n", + "person = Person(name=\"John Doe\", age=30, address=Address(street=\"123 Main St\", city=\"Anytown\", zipcode=\"12345\"))\n", + "print(person)\n", + "\n", + "def to_dict(obj: Any) -> Dict[str, Any]:\n", + " if hasattr(obj, \"__dataclass_fields__\"):\n", + " return {key: to_dict(value) for key, value in obj.__dict__.items()}\n", + " elif isinstance(obj, list):\n", + " return [to_dict(item) for item in obj]\n", + " elif isinstance(obj, dict):\n", + " return {key: to_dict(value) for key, value in obj.items()}\n", + " else:\n", + " return obj\n", + "\n", + "# Convert the person instance to a dictionary\n", + "person_dict = to_dict(person)\n", + "print(person_dict)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Person(name='John Doe', age=30, addresses=[Address(street='123 Main St', city='Anytown', zipcode='12345'), Address(street='456 Elm St', city='Othertown', zipcode='67890')])\n" + ] + } + ], + "source": [ + "from typing import List\n", + "@dataclass\n", + "class Address:\n", + " street: str\n", + " city: str\n", + " zipcode: str\n", + "\n", + "@dataclass\n", + "class Person:\n", + " name: str\n", + " age: int\n", + " addresses: List[Address]\n", + "\n", + "# Example instance of the nested dataclasses\n", + "person = Person(name=\"John Doe\", age=30, addresses=[Address(street=\"123 Main St\", city=\"Anytown\", zipcode=\"12345\"), Address(street=\"456 Elm St\", city=\"Othertown\", zipcode=\"67890\")])\n", + "print(person)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'name': 'John Doe', 'age': 30, 'addresses': [{'street': '123 Main St', 'city': 'Anytown', 'zipcode': '12345'}, {'street': '456 Elm St', 'city': 'Othertown', 'zipcode': '67890'}]}\n" + ] + } + ], + 
"source": [ + "# Convert the person instance to a dictionary\n", + "person_dict = to_dict(person)\n", + "print(person_dict)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'data': [{'question': 'What is the capital of France?'}]}" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from typing import List, Dict, Optional\n", + "def dataclass_obj_to_dict(\n", + " obj: Any, exclude: Optional[Dict[str, List[str]]] = None, parent_key: str = \"\"\n", + ") -> Dict[str, Any]:\n", + " r\"\"\"Convert a dataclass object to a dictionary.\n", + "\n", + " Supports nested dataclasses, lists, and dictionaries.\n", + " Allow exclude keys for each dataclass object.\n", + " Example:\n", + "\n", + " .. code-block:: python\n", + "\n", + " from dataclasses import dataclass\n", + " from typing import List\n", + "\n", + " @dataclass\n", + " class TrecData:\n", + " question: str\n", + " label: int\n", + "\n", + " @dataclass\n", + " class TrecDataList:\n", + "\n", + " data: List[TrecData]\n", + " name: str\n", + "\n", + " trec_data = TrecData(question=\"What is the capital of France?\", label=0)\n", + " trec_data_list = TrecDataList(data=[trec_data], name=\"trec_data_list\")\n", + "\n", + " dataclass_obj_to_dict(trec_data_list, exclude={\"TrecData\": [\"label\"], \"TrecDataList\": [\"name\"]})\n", + "\n", + " # Output:\n", + " # {'data': [{'question': 'What is the capital of France?'}], 'name': 'trec_data_list'}\n", + "\n", + " \"\"\"\n", + " if exclude is None:\n", + " exclude = {}\n", + "\n", + " obj_class_name = obj.__class__.__name__\n", + " current_exclude = exclude.get(obj_class_name, [])\n", + "\n", + " if hasattr(obj, \"__dataclass_fields__\"):\n", + " return {\n", + " key: dataclass_obj_to_dict(value, exclude, parent_key=key)\n", + " for key, value in obj.__dict__.items()\n", + " if key not in current_exclude\n", + " }\n", + " elif isinstance(obj, list):\n", + " return [dataclass_obj_to_dict(item, exclude, parent_key) for item in obj]\n", + " elif isinstance(obj, dict):\n", + " return {\n", + " key: dataclass_obj_to_dict(value, exclude, parent_key)\n", + " for key, value in obj.items()\n", + " }\n", + " else:\n", + " return obj\n", + "\n", + "from dataclasses import dataclass\n", + "from typing import List\n", + "\n", + "@dataclass\n", + "class TrecData:\n", + " question: str\n", + " label: int\n", + "\n", + "@dataclass\n", + "class TrecDataList:\n", + "\n", + " data: List[TrecData]\n", + " name: str\n", + "\n", + "trec_data = TrecData(question=\"What is the capital of France?\", label=0)\n", + "trec_data_list = TrecDataList(data=[trec_data], name=\"trec_data_list\")\n", + "\n", + "dataclass_obj_to_dict(trec_data_list, exclude={\"TrecData\": [\"label\"], \"TrecDataList\": [\"name\"]})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Type\n", + "def dataclass_obj_from_dict(cls: Type[Any], data: Dict[str, Any]) -> Any:\n", + " if hasattr(cls, \"__dataclass_fields__\"):\n", + " fieldtypes = {f.name: f.type for f in cls.__dataclass_fields__.values()}\n", + " return cls(**{key: dataclass_obj_from_dict(fieldtypes[key], value) for key, value in data.items()})\n", + " elif isinstance(data, list):\n", + " return [dataclass_obj_from_dict(cls.__args__[0], item) for item in data]\n", + " elif isinstance(data, dict):\n", + " return {key: dataclass_obj_from_dict(cls.__args__[1], value) for 
key, value in data.items()}\n", + " else:\n", + " return data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "TrecDataList(data=[TrecData(question='What is the capital of France?', label=0)], name='trec_data_list')" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataclass_obj_from_dict(TrecDataList, dataclass_obj_to_dict(trec_data_list))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "ename": "TypeError", + "evalue": "TrecData.__init__() missing 1 required positional argument: 'label'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[25], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mdataclass_obj_from_dict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mTrecDataList\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdataclass_obj_to_dict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtrec_data_list\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexclude\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m{\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mTrecData\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mlabel\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mTrecDataList\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mname\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m}\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n", + "Cell \u001b[0;32mIn[23], line 5\u001b[0m, in \u001b[0;36mdataclass_obj_from_dict\u001b[0;34m(cls, data)\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(\u001b[38;5;28mcls\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m__dataclass_fields__\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 4\u001b[0m fieldtypes \u001b[38;5;241m=\u001b[39m {f\u001b[38;5;241m.\u001b[39mname: f\u001b[38;5;241m.\u001b[39mtype \u001b[38;5;28;01mfor\u001b[39;00m f \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m__dataclass_fields__\u001b[38;5;241m.\u001b[39mvalues()}\n\u001b[0;32m----> 5\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mcls\u001b[39m(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39m\u001b[43m{\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mdataclass_obj_from_dict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfieldtypes\u001b[49m\u001b[43m[\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m 
\u001b[49m\u001b[43mdata\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mitems\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m}\u001b[49m)\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(data, \u001b[38;5;28mlist\u001b[39m):\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m [dataclass_obj_from_dict(\u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m__args__[\u001b[38;5;241m0\u001b[39m], item) \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m data]\n", + "Cell \u001b[0;32mIn[23], line 5\u001b[0m, in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(\u001b[38;5;28mcls\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m__dataclass_fields__\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 4\u001b[0m fieldtypes \u001b[38;5;241m=\u001b[39m {f\u001b[38;5;241m.\u001b[39mname: f\u001b[38;5;241m.\u001b[39mtype \u001b[38;5;28;01mfor\u001b[39;00m f \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m__dataclass_fields__\u001b[38;5;241m.\u001b[39mvalues()}\n\u001b[0;32m----> 5\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mcls\u001b[39m(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39m{key: \u001b[43mdataclass_obj_from_dict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfieldtypes\u001b[49m\u001b[43m[\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mfor\u001b[39;00m key, value \u001b[38;5;129;01min\u001b[39;00m data\u001b[38;5;241m.\u001b[39mitems()})\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(data, \u001b[38;5;28mlist\u001b[39m):\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m [dataclass_obj_from_dict(\u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m__args__[\u001b[38;5;241m0\u001b[39m], item) \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m data]\n", + "Cell \u001b[0;32mIn[23], line 7\u001b[0m, in \u001b[0;36mdataclass_obj_from_dict\u001b[0;34m(cls, data)\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mcls\u001b[39m(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39m{key: dataclass_obj_from_dict(fieldtypes[key], value) \u001b[38;5;28;01mfor\u001b[39;00m key, value \u001b[38;5;129;01min\u001b[39;00m data\u001b[38;5;241m.\u001b[39mitems()})\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(data, \u001b[38;5;28mlist\u001b[39m):\n\u001b[0;32m----> 7\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m[\u001b[49m\u001b[43mdataclass_obj_from_dict\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mcls\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m__args__\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mitem\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mitem\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(data, \u001b[38;5;28mdict\u001b[39m):\n\u001b[1;32m 9\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m {key: 
dataclass_obj_from_dict(\u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m__args__[\u001b[38;5;241m1\u001b[39m], value) \u001b[38;5;28;01mfor\u001b[39;00m key, value \u001b[38;5;129;01min\u001b[39;00m data\u001b[38;5;241m.\u001b[39mitems()}\n", + "Cell \u001b[0;32mIn[23], line 7\u001b[0m, in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mcls\u001b[39m(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39m{key: dataclass_obj_from_dict(fieldtypes[key], value) \u001b[38;5;28;01mfor\u001b[39;00m key, value \u001b[38;5;129;01min\u001b[39;00m data\u001b[38;5;241m.\u001b[39mitems()})\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(data, \u001b[38;5;28mlist\u001b[39m):\n\u001b[0;32m----> 7\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m [\u001b[43mdataclass_obj_from_dict\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mcls\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m__args__\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mitem\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m data]\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(data, \u001b[38;5;28mdict\u001b[39m):\n\u001b[1;32m 9\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m {key: dataclass_obj_from_dict(\u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m__args__[\u001b[38;5;241m1\u001b[39m], value) \u001b[38;5;28;01mfor\u001b[39;00m key, value \u001b[38;5;129;01min\u001b[39;00m data\u001b[38;5;241m.\u001b[39mitems()}\n", + "Cell \u001b[0;32mIn[23], line 5\u001b[0m, in \u001b[0;36mdataclass_obj_from_dict\u001b[0;34m(cls, data)\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(\u001b[38;5;28mcls\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m__dataclass_fields__\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 4\u001b[0m fieldtypes \u001b[38;5;241m=\u001b[39m {f\u001b[38;5;241m.\u001b[39mname: f\u001b[38;5;241m.\u001b[39mtype \u001b[38;5;28;01mfor\u001b[39;00m f \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m__dataclass_fields__\u001b[38;5;241m.\u001b[39mvalues()}\n\u001b[0;32m----> 5\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mcls\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43m{\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mdataclass_obj_from_dict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfieldtypes\u001b[49m\u001b[43m[\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mitems\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m}\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(data, \u001b[38;5;28mlist\u001b[39m):\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
[dataclass_obj_from_dict(\u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m__args__[\u001b[38;5;241m0\u001b[39m], item) \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m data]\n", + "\u001b[0;31mTypeError\u001b[0m: TrecData.__init__() missing 1 required positional argument: 'label'" + ] + } + ], + "source": [ + "dataclass_obj_from_dict(TrecDataList, dataclass_obj_to_dict(trec_data_list, exclude={\"TrecData\": [\"label\"], \"TrecDataList\": [\"name\"]}))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "lightrag-project", + "language": "python", + "name": "light-rag-project" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/developer_notes/tools.py b/developer_notes/tools.py new file mode 100644 index 00000000..cbd2fac2 --- /dev/null +++ b/developer_notes/tools.py @@ -0,0 +1,360 @@ +from dataclasses import dataclass, field +from typing import List +import numpy as np +import time +import asyncio +from lightrag.core import Component, DataClass +from lightrag.core.types import Function, FunctionExpression +from lightrag.core.tool_manager import ToolManager +from lightrag.components.output_parsers import JsonOutputParser + +from lightrag.core.generator import Generator +from lightrag.core.types import ModelClientType + + +def multiply(a: int, b: int) -> int: + """Multiply two numbers.""" + time.sleep(2) + return a * b + + +def add(a: int, b: int) -> int: + """Add two numbers.""" + time.sleep(3) + return a + b + + +async def divide(a: float, b: float) -> float: + """Divide two numbers.""" + await asyncio.sleep(3) + return float(a) / b + + +async def search(query: str) -> List[str]: + """Search for query and return a list of results.""" + await asyncio.sleep(1) + return ["result1" + query, "result2" + query] + + +def numpy_sum(arr: np.ndarray) -> float: + """Sum the elements of an array.""" + return np.sum(arr) + + +x = 2 + + +@dataclass +class Point: + x: int + y: int + + +def add_points(p1: Point, p2: Point) -> Point: + return Point(p1.x + p2.x, p1.y + p2.y) + + +@dataclass # TODO: data class schema, need to go all the way down to the subclass +class MultipleFunctionDefinition(DataClass): + a: Point = field(metadata={"desc": "First number"}) + b: int = field(metadata={"desc": "Second number"}) + + +# optionally to define schma yourself, this can be used to generate FunctionDefinition +print(MultipleFunctionDefinition.to_schema_str()) +# use function tool + +template = r"""You have these tools available: +{% if tools %} + +{% for tool in tools %} +{{ loop.index }}. +{{tool}} +------------------------ +{% endfor %} + +{% endif %} + +{{output_format_str}} + + +User: {{input_str}} +You: +""" + +template_with_context = r"""You have these tools available: +{% if tools %} + +{% for tool in tools %} +{{ loop.index }}. +{{tool}} +------------------------ +{% endfor %} + +{% endif %} + +Your function expression also have access to these context: +{{context_str}} + + +{{output_format_str}} + + +User: {{input_str}} +You: +""" + +multple_function_call_template = r"""You have these tools available: +{% if tools %} + +{% for tool in tools %} +{{ loop.index }}. +{{tool}} +------------------------ +{% endfor %} + +{% endif %} + +Here is how you call one function. 
+{{output_format_str}} +-Always return a List using `[]` of the above JSON objects, even if its just one item. + + +{{input_str}} +You: +""" + +queries = [ + "add 2 and 3", + "search for something", + "add points (1, 2) and (3, 4)", + "sum numpy array with arr = np.array([[1, 2], [3, 4]])", + "multiply 2 with local variable x", + "divide 2 by 3", + "Add 5 to variable y", +] +functions = [multiply, add, divide, search, numpy_sum, add_points] + + +class FunctionCall(Component): + def __init__(self): + super().__init__() + + def prepare_single_function_call_generator(self): + tool_manager = ToolManager(tools=functions) + func_parser = JsonOutputParser( + data_class=Function, exclude_fields=["thought", "args"] + ) + + model_kwargs = {"model": "gpt-3.5-turbo"} + prompt_kwargs = { + "tools": tool_manager.yaml_definitions, + "output_format_str": func_parser.format_instructions(), + } + generator = Generator( + model_client=ModelClientType.OPENAI(), + model_kwargs=model_kwargs, + template=template, + prompt_kwargs=prompt_kwargs, + output_processors=func_parser, + ) + generator.print_prompt(**prompt_kwargs) + return generator, tool_manager + + def run_function_call(self, generator: Generator, tool_manager: ToolManager): + + for idx, query in enumerate(queries): + prompt_kwargs = {"input_str": query} + print(f"\n{idx} Query: {query}") + print(f"{'-'*50}") + try: + result = generator(prompt_kwargs=prompt_kwargs) + # print(f"LLM raw output: {result.raw_response}") + func = Function.from_dict(result.data) + print(f"Function: {func}") + func_output = tool_manager.execute_func(func) + print(f"Function output: {func_output}") + except Exception as e: + print( + f"Failed to execute the function for query: {query}, func: {result.data}, error: {e}" + ) + + +class FunctionCallWithFunctionExpression(Component): + def __init__(self): + super().__init__() + + def prepare_single_function_call_generator(self): + tool_manager = ToolManager( + tools=functions, + additional_context={ + "x": x, + "y": 0, + "np.array": np.array, + "np": np, + "Point": Point, + }, + ) + func_parser = JsonOutputParser( + data_class=FunctionExpression, exclude_fields=["thought", "args"] + ) + instructions = func_parser.format_instructions() + print(instructions) + + model_kwargs = {"model": "gpt-4o"} + prompt_kwargs = { + "tools": tool_manager.yaml_definitions, + "output_format_str": func_parser.format_instructions(), + "context_str": tool_manager._additional_context, + } + generator = Generator( + model_client=ModelClientType.OPENAI(), + model_kwargs=model_kwargs, + template=template_with_context, + prompt_kwargs=prompt_kwargs, + output_processors=func_parser, + ) + generator.print_prompt(**prompt_kwargs) + return generator, tool_manager + + def run_function_call(self, generator: Generator, tool_manager: ToolManager): + start_time = time.time() + for idx, query in enumerate(queries): + prompt_kwargs = {"input_str": query} + print(f"\n{idx} Query: {query}") + print(f"{'-'*50}") + try: + result = generator(prompt_kwargs=prompt_kwargs) + # print(f"LLM raw output: {result.raw_response}") + func_expr = FunctionExpression.from_dict(result.data) + print(f"Function_expr: {func_expr}") + # func: Function = tool_manager.parse_func_expr(func_expr) + # func_output = tool_manager.execute_func(func) + # or + # func_output = tool_manager.execute_func_expr_via_sandbox(func_expr) + # or + func_output = tool_manager.execute_func_expr_via_eval(func_expr) + print(f"Function output: {func_output}") + except Exception as e: + print( + f"Failed to execute the 
function for query: {query}, func: {result.data}, error: {e}" + ) + end_time = time.time() + print(f"Total time taken: {end_time - start_time :.2f} seconds") + + async def run_async_function_call(self, generator, tool_manager): + answers = [] + start_time = time.time() + tasks = [] + for idx, query in enumerate(queries): + tasks.append(self.process_query(idx, query, generator, tool_manager)) + + results = await asyncio.gather(*tasks) + answers.extend(results) + end_time = time.time() + print(f"Total time taken: {end_time - start_time :.2f} seconds") + return answers + + async def process_query(self, idx, query, generator, tool_manager: ToolManager): + print(f"\n{idx} Query: {query}") + print(f"{'-'*50}") + try: + result = generator(prompt_kwargs={"input_str": query}) + func_expr = FunctionExpression.from_dict(result.data) + print(f"Function_expr: {func_expr}") + # func = tool_manager.parse_func_expr(func_expr) + # func_output = await tool_manager.execute_func(func) + # or + func_output = await tool_manager.execute_func_expr(func_expr) + + print(f"Function output: {func_output}") + return func_output + except Exception as e: + print( + f"Failed to execute the function for query: {query}, func: {result.data}, error: {e}" + ) + return None + + +class MultiFunctionCallWithFunctionExpression(Component): + def __init__(self): + super().__init__() + + def prepare_single_function_call_generator(self): + tool_manager = ToolManager( + tools=functions, + additional_context={ + "x": x, + "y": 0, + "np.array": np.array, + "np": np, + "Point": Point, + }, + ) + example = FunctionExpression.from_function( + func=add_points, p1=Point(x=1, y=2), p2=Point(x=3, y=4) + ) + func_parser = JsonOutputParser( + data_class=FunctionExpression, + examples=[example], + exclude_fields=["thought"], + ) + instructions = func_parser.format_instructions() + print(instructions) + + model_kwargs = {"model": "gpt-4o"} + prompt_kwargs = { + "tools": tool_manager.yaml_definitions, + "output_format_str": func_parser.format_instructions(), + } + generator = Generator( + model_client=ModelClientType.OPENAI(), + model_kwargs=model_kwargs, + template=multple_function_call_template, + prompt_kwargs=prompt_kwargs, + output_processors=func_parser, + ) + generator.print_prompt(**prompt_kwargs) + return generator, tool_manager + + def run_function_call(self, generator: Generator, tool_manager: ToolManager): + start_time = time.time() + for idx in range(0, len(queries), 2): + query = " and ".join(queries[idx : idx + 2]) + prompt_kwargs = {"input_str": query} + print(f"\n{idx} Query: {query}") + print(f"{'-'*50}") + try: + result = generator(prompt_kwargs=prompt_kwargs) + print(f"LLM raw output: {result.raw_response}") + func_expr: List[FunctionExpression] = [ + FunctionExpression.from_dict(item) for item in result.data + ] + print(f"Function_expr: {func_expr}") + for expr in func_expr: + func_output = tool_manager.execute_func_expr_via_eval(expr) + print(f"Function output: {func_output}") + except Exception as e: + print( + f"Failed to execute the function for query: {query}, func: {result.data}, error: {e}" + ) + end_time = time.time() + print(f"Total time taken: {end_time - start_time :.2f} seconds") + + +if __name__ == "__main__": + # fc = FunctionCall() + # generator, tool_manager = fc.prepare_single_function_call_generator() + # fc.run_function_call(generator, tool_manager) + + # fc = FunctionCallWithFunctionExpression() + # generator, tool_manager = fc.prepare_single_function_call_generator() + # fc.run_function_call(generator, 
tool_manager) # 15.92s + # asyncio.run(fc.run_async_function_call(generator, tool_manager)) # 7.8s + + output = eval("add(a=y, b=5)", {"y": 3, "add": add}) + print(output) + + mul_fc = MultiFunctionCallWithFunctionExpression() + generator, tool_manager = mul_fc.prepare_single_function_call_generator() + mul_fc.run_function_call(generator, tool_manager) diff --git a/docs/.gitignore b/docs/.gitignore index 9f2cc5a9..b72c4b2c 100644 --- a/docs/.gitignore +++ b/docs/.gitignore @@ -1,15 +1,15 @@ -source/apis/components/components* -source/apis/components/_autosummary* -source/apis/components/modules* -source/apis/core/core* -source/apis/core/modules* -source/apis/eval/eval* -source/apis/eval/modules* -source/apis/prompts/prompts* -source/apis/prompts/modules* -source/apis/utils/utils* -source/apis/utils/modules* -source/apis/tracing/tracing* -source/apis/tracing/modules* -source/apis/optim/optim* -source/apis/optim/modules* \ No newline at end of file +# source/apis/components/components* +# source/apis/components/_autosummary* +# source/apis/components/modules* +# source/apis/core/core* +# source/apis/core/modules* +# source/apis/eval/eval* +# source/apis/eval/modules* +# source/apis/prompts/prompts* +# source/apis/prompts/modules* +# source/apis/utils/utils* +# source/apis/utils/modules* +# source/apis/tracing/tracing* +# source/apis/tracing/modules* +# source/apis/optim/optim* +# source/apis/optim/modules* diff --git a/docs/Makefile b/docs/Makefile index cb659334..3b198b52 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -22,12 +22,12 @@ help: @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) apidoc: - @sphinx-apidoc -o $(APIDOCOUTDIR)/core ../lightrag/core --separate --force - @sphinx-apidoc -o $(APIDOCOUTDIR)/components ../lightrag/components --separate --force --templatedir=$(SOURCEDIR)/_templates - @sphinx-apidoc -o $(APIDOCOUTDIR)/eval ../lightrag/eval --separate --force - @sphinx-apidoc -o $(APIDOCOUTDIR)/optim ../lightrag/optim --separate --force - @sphinx-apidoc -o $(APIDOCOUTDIR)/utils ../lightrag/utils --separate --force - @sphinx-apidoc -o $(APIDOCOUTDIR)/tracing ../lightrag/tracing --separate --force + @sphinx-apidoc -o $(APIDOCOUTDIR)/core ../lightrag/lightrag/core --separate --force + @sphinx-apidoc -o $(APIDOCOUTDIR)/components ../lightrag/lightrag/components --separate --force --templatedir=$(SOURCEDIR)/_templates + @sphinx-apidoc -o $(APIDOCOUTDIR)/eval ../lightrag/lightrag/eval --separate --force + @sphinx-apidoc -o $(APIDOCOUTDIR)/optim ../lightrag/lightrag/optim --separate --force + @sphinx-apidoc -o $(APIDOCOUTDIR)/utils ../lightrag/lightrag/utils --separate --force + @sphinx-apidoc -o $(APIDOCOUTDIR)/tracing ../lightrag/lightrag/tracing --separate --force @echo "Inserting reference labels into RST files." @python $(SOURCEDIR)/insert_labels.py @echo "Removing unnecessary strings for better formatting" @@ -38,4 +38,4 @@ apidoc: html: apidoc - @$(SPHINXBUILD) -b html "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file + @$(SPHINXBUILD) -b html "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/README.md b/docs/README.md index e03d3149..ee2e8aff 100644 --- a/docs/README.md +++ b/docs/README.md @@ -9,7 +9,7 @@ ## How the Documentation Works -We use [Sphinx](https://www.sphinx-doc.org/en/master/) as the documentation tool and [reStructuredText](https://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html) as the language. 
Sphinx primarily reads configurations from a Python script (`conf.py`), pulls documentation from comments in the code (via the `autodoc` extension), and organizes content through its table of contents hierarchy defined in `.rst` files. +We use [Sphinx](https://www.sphinx-doc.org/en/master/) as the documentation tool and [reStructuredText](https://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html) as the language. Sphinx primarily reads configurations from a Python script (`conf.py`), pulls documentation from comments in the code (via the `autodoc` extension), and organizes content through its table of contents hierarchy defined in `.rst` files. ## Setup @@ -19,7 +19,24 @@ We use [Sphinx](https://www.sphinx-doc.org/en/master/) as the documentation tool ### **2. Install Necessary Packages** -`pip install sphinx sphinx-rtd-theme` +You can use either `poetry` or `pip` to install the necessary packages. + +**Use `poetry`:** + + +All the packages are manged in the project's ``pyproject.toml`` file in the doc dependencies section. You can install all the necessary packages by running: + +``` +poetry install --with doc +``` + +**Use `pip`:** + +Or you can use `pip` to install the necessary packages listed in ``requirements.txt``: + +``` +pip install -r requirements.txt +``` ### **3. Build the Documentation** @@ -28,6 +45,12 @@ cd docs make html ``` +Build with more options: + +``` +sphinx-build -b html source build -v +``` + ### **4. View the Documentation** After building the documentation, you can use any browser to view it by opening the `index.html` file located in `docs/build/html`. @@ -71,7 +94,7 @@ LightRAG comes from the best of the AI research and engineering. Fundamentally, ### **Existing Sections** -Existing sections include: +Existing sections include: `get_started/`: Includes installation and LightRAG introduction @@ -85,7 +108,7 @@ Existing sections include: Most of the documentation updates should be written as comments/doc-strings in your source code, which will be automatically converted to docs. Do manual editing when you add instructions to use your code, adjust the layout, etc. -The existing documentation is a combination of automatic generation and human editing. +The existing documentation is a combination of automatic generation and human editing. ### **Source Code Doc-string Update** @@ -109,7 +132,7 @@ If you add new modules or code to the project, sphinx has a [command](https://ww sphinx-apidoc [OPTIONS] -o [EXCLUDE_PATTERN …] ``` -***Note:*** +***Note:*** If your new module is a folder, it should contain a `__init__.py` file. @@ -125,7 +148,7 @@ sphinx-apidoc -o docs/source/tutorials ./use_cases **test** (*test* is to exclude the files containing `test` in the filename) -You will find a `modules.rst` and a `use_cases.rst` in the `docs/source/tutorials`. The `use_cases.rst` contains all the packages included in your `./use_cases`. +You will find a `modules.rst` and a `use_cases.rst` in the `docs/source/tutorials`. The `use_cases.rst` contains all the packages included in your `./use_cases`. Then you should add the link to the `index.rst` to show your source code and docs in the documentation. Find `docs/source/index.rst` and add the new section: @@ -134,11 +157,11 @@ Then you should add the link to the `index.rst` to show your source code and doc :glob: :maxdepth: 1 :caption: Use Cases - + tutorials/use_cases ``` -Then run: +Then run: ```python cd docs @@ -153,9 +176,9 @@ And you will be able to find the newly added use_cases module. 
If you want to add any written files such as README.md to the documentation, there is an easy way to transform the files to `.rst` files using `Pandoc`. - First, install Pandoc with Homebrew: - - `brew install pandoc` - + + `brew install pandoc` + - Then run `pandoc -s -o `. For example, in the root directory run `pandoc -s README.md -o docs/source/get_started/introduction.rst`.This command will take content from `README.md` and create an `introduction.rst` file in the specified directory. After editing, run @@ -208,4 +231,4 @@ LightRAG/ │ ├── __init__.py │ ├── module1.py │ ├── module2.py -``` \ No newline at end of file +``` diff --git a/docs/requirements.txt b/docs/requirements.txt index 428413ff..e59cca03 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,4 +1,11 @@ -pydata-sphinx-theme==0.15.2 -Sphinx==7.3.7 -sphinx_design==0.6.0 -sphinx-copybutton==0.5.2 \ No newline at end of file +pydata-sphinx-theme==0.15.3 +sphinx-design==0.6.0 +sphinx-copybutton==0.5.2 +sphinx==7.3.7 +nbsphinx==0.9.4 +nbconvert==7.16.4 +PyYAML +readthedocs-sphinx-search==0.3.2 +numpy +tqdm +tiktoken \ No newline at end of file diff --git a/docs/source/_static/class_hierarchy.html b/docs/source/_static/class_hierarchy.html new file mode 100644 index 00000000..f6f6256f --- /dev/null +++ b/docs/source/_static/class_hierarchy.html @@ -0,0 +1,155 @@ + + + + + + + + + +
+ + + + + + + diff --git a/docs/source/_static/css/custom.css b/docs/source/_static/css/custom.css index 8f4f9369..a97701dd 100644 --- a/docs/source/_static/css/custom.css +++ b/docs/source/_static/css/custom.css @@ -5,12 +5,33 @@ --pst-color-logo: #2EB5EB; --bs-gray-500:#adb5bd; + +} +.theme-version { + display: none; +} +.bd-sidebar-primary { + width: 100%; /* Full width by default */ +} + +/* Adjust width for larger screens */ +@media (min-width: 768px) { /* Example breakpoint for tablets and larger */ + .bd-sidebar-primary { + width: 270px; /* Adjust the width for larger screens */ } +} +.pre { + color: #0a7d91; /* Change the color of ```` blocks */ +} + +/* .copyright { + text-align: center; +} */ p { font-size: 0.9em; margin-bottom: 1.15rem; } - + html[data-theme=light] { --pst-color-secondary: #3d3d3d; /*change the secondary color, header link to gray */ --pst-color-link-hover: #25262; /*change the side bar link color to black */ @@ -27,7 +48,7 @@ h1{ font-size: 2rem; /* make the h1 in the code smaller */ } /* .bd-page-width { - max-width: 100%; + max-width: 100%; } */ .sig-name { diff --git a/docs/source/_static/database.png b/docs/source/_static/images/database.png similarity index 100% rename from docs/source/_static/database.png rename to docs/source/_static/images/database.png diff --git a/docs/source/_static/images/dataclass.png b/docs/source/_static/images/dataclass.png new file mode 100644 index 00000000..33f4c483 Binary files /dev/null and b/docs/source/_static/images/dataclass.png differ diff --git a/docs/source/_static/images/generator.png b/docs/source/_static/images/generator.png new file mode 100644 index 00000000..e3bb66cd Binary files /dev/null and b/docs/source/_static/images/generator.png differ diff --git a/docs/source/apis/components/_autosummary/components.agent.react.rst b/docs/source/apis/components/_autosummary/components.agent.react.rst new file mode 100644 index 00000000..9c0d4e93 --- /dev/null +++ b/docs/source/apis/components/_autosummary/components.agent.react.rst @@ -0,0 +1,20 @@ +components.agent.react +====================== + +.. automodule:: components.agent.react + + + + + + + + + + + + .. rubric:: Classes + + .. autosummary:: + + ReActAgent diff --git a/docs/source/apis/components/_autosummary/components.api_client.anthropic_client.rst b/docs/source/apis/components/_autosummary/components.api_client.anthropic_client.rst new file mode 100644 index 00000000..31e030aa --- /dev/null +++ b/docs/source/apis/components/_autosummary/components.api_client.anthropic_client.rst @@ -0,0 +1,22 @@ +.. _components-api_client-anthropic_client: + +components.api\_client.anthropic\_client +======================================== + +.. automodule:: components.api_client.anthropic_client + + + + + + + + + + + + .. rubric:: Classes + + .. autosummary:: + + AnthropicAPIClient diff --git a/docs/source/apis/components/_autosummary/components.api_client.google_client.rst b/docs/source/apis/components/_autosummary/components.api_client.google_client.rst new file mode 100644 index 00000000..8da90bc6 --- /dev/null +++ b/docs/source/apis/components/_autosummary/components.api_client.google_client.rst @@ -0,0 +1,22 @@ +.. _components-api_client-google_client: + +components.api\_client.google\_client +===================================== + +.. automodule:: components.api_client.google_client + + + + + + + + + + + + .. rubric:: Classes + + .. 
autosummary:: + + GoogleGenAIClient diff --git a/docs/source/apis/components/_autosummary/components.api_client.groq_client.rst b/docs/source/apis/components/_autosummary/components.api_client.groq_client.rst new file mode 100644 index 00000000..ccaab3bd --- /dev/null +++ b/docs/source/apis/components/_autosummary/components.api_client.groq_client.rst @@ -0,0 +1,22 @@ +.. _components-api_client-groq_client: + +components.api\_client.groq\_client +=================================== + +.. automodule:: components.api_client.groq_client + + + + + + + + + + + + .. rubric:: Classes + + .. autosummary:: + + GroqAPIClient diff --git a/docs/source/apis/components/_autosummary/components.api_client.openai_client.rst b/docs/source/apis/components/_autosummary/components.api_client.openai_client.rst new file mode 100644 index 00000000..4c14edec --- /dev/null +++ b/docs/source/apis/components/_autosummary/components.api_client.openai_client.rst @@ -0,0 +1,22 @@ +.. _components-api_client-openai_client: + +components.api\_client.openai\_client +===================================== + +.. automodule:: components.api_client.openai_client + + + + + + + + + + + + .. rubric:: Classes + + .. autosummary:: + + OpenAIClient diff --git a/docs/source/apis/components/_autosummary/components.api_client.transformers_client.rst b/docs/source/apis/components/_autosummary/components.api_client.transformers_client.rst new file mode 100644 index 00000000..0398fa6f --- /dev/null +++ b/docs/source/apis/components/_autosummary/components.api_client.transformers_client.rst @@ -0,0 +1,29 @@ +.. _components-api_client-transformers_client: + +components.api\_client.transformers\_client +=========================================== + +.. automodule:: components.api_client.transformers_client + + + + + + + + .. rubric:: Functions + + .. autosummary:: + + average_pool + + + + + + .. rubric:: Classes + + .. autosummary:: + + TransformerEmbedder + TransformersClient diff --git a/docs/source/apis/components/_autosummary/components.data_process.data_components.rst b/docs/source/apis/components/_autosummary/components.data_process.data_components.rst new file mode 100644 index 00000000..4482fe53 --- /dev/null +++ b/docs/source/apis/components/_autosummary/components.data_process.data_components.rst @@ -0,0 +1,27 @@ +components.data\_process.data\_components +========================================= + +.. automodule:: components.data_process.data_components + + + + + + + + .. rubric:: Functions + + .. autosummary:: + + retriever_output_to_context_str + + + + + + .. rubric:: Classes + + .. autosummary:: + + RetrieverOutputToContextStr + ToEmbeddings diff --git a/docs/source/apis/components/_autosummary/components.data_process.document_splitter.rst b/docs/source/apis/components/_autosummary/components.data_process.document_splitter.rst new file mode 100644 index 00000000..205e73fc --- /dev/null +++ b/docs/source/apis/components/_autosummary/components.data_process.document_splitter.rst @@ -0,0 +1,28 @@ +.. _components-data_process-document_splitter: + +components.data\_process.document\_splitter +=========================================== + +.. automodule:: components.data_process.document_splitter + + + + + + + + .. rubric:: Functions + + .. autosummary:: + + split_text_by_token_fn + + + + + + .. rubric:: Classes + + .. 
autosummary:: + + DocumentSplitter diff --git a/docs/source/apis/components/_autosummary/components.data_process.text_splitter.rst b/docs/source/apis/components/_autosummary/components.data_process.text_splitter.rst new file mode 100644 index 00000000..63a89e71 --- /dev/null +++ b/docs/source/apis/components/_autosummary/components.data_process.text_splitter.rst @@ -0,0 +1,20 @@ +components.data\_process.text\_splitter +======================================= + +.. automodule:: components.data_process.text_splitter + + + + + + + + + + + + .. rubric:: Classes + + .. autosummary:: + + TextSplitter diff --git a/docs/source/apis/components/_autosummary/components.memory.memory.rst b/docs/source/apis/components/_autosummary/components.memory.memory.rst new file mode 100644 index 00000000..0338de4d --- /dev/null +++ b/docs/source/apis/components/_autosummary/components.memory.memory.rst @@ -0,0 +1,20 @@ +components.memory.memory +======================== + +.. automodule:: components.memory.memory + + + + + + + + + + + + .. rubric:: Classes + + .. autosummary:: + + Memory diff --git a/docs/source/apis/components/_autosummary/components.model_client.anthropic_client.rst b/docs/source/apis/components/_autosummary/components.model_client.anthropic_client.rst new file mode 100644 index 00000000..3c6b45a7 --- /dev/null +++ b/docs/source/apis/components/_autosummary/components.model_client.anthropic_client.rst @@ -0,0 +1,20 @@ +components.model\_client.anthropic\_client +========================================== + +.. automodule:: components.model_client.anthropic_client + + + + + + + + + + + + .. rubric:: Classes + + .. autosummary:: + + AnthropicAPIClient diff --git a/docs/source/apis/components/_autosummary/components.model_client.cohere_client.rst b/docs/source/apis/components/_autosummary/components.model_client.cohere_client.rst new file mode 100644 index 00000000..03c29e46 --- /dev/null +++ b/docs/source/apis/components/_autosummary/components.model_client.cohere_client.rst @@ -0,0 +1,20 @@ +components.model\_client.cohere\_client +======================================= + +.. automodule:: components.model_client.cohere_client + + + + + + + + + + + + .. rubric:: Classes + + .. autosummary:: + + CohereAPIClient diff --git a/docs/source/apis/components/_autosummary/components.model_client.google_client.rst b/docs/source/apis/components/_autosummary/components.model_client.google_client.rst new file mode 100644 index 00000000..744dc3bc --- /dev/null +++ b/docs/source/apis/components/_autosummary/components.model_client.google_client.rst @@ -0,0 +1,22 @@ +.. _components-model_client-google_client: + +components.model\_client.google\_client +======================================= + +.. automodule:: components.model_client.google_client + + + + + + + + + + + + .. rubric:: Classes + + .. autosummary:: + + GoogleGenAIClient diff --git a/docs/source/apis/components/_autosummary/components.model_client.groq_client.rst b/docs/source/apis/components/_autosummary/components.model_client.groq_client.rst new file mode 100644 index 00000000..02650f40 --- /dev/null +++ b/docs/source/apis/components/_autosummary/components.model_client.groq_client.rst @@ -0,0 +1,20 @@ +components.model\_client.groq\_client +===================================== + +.. automodule:: components.model_client.groq_client + + + + + + + + + + + + .. rubric:: Classes + + .. 
autosummary:: + + GroqAPIClient diff --git a/docs/source/apis/components/_autosummary/components.model_client.openai_client.rst b/docs/source/apis/components/_autosummary/components.model_client.openai_client.rst new file mode 100644 index 00000000..9b8bc7e7 --- /dev/null +++ b/docs/source/apis/components/_autosummary/components.model_client.openai_client.rst @@ -0,0 +1,28 @@ +components.model\_client.openai\_client +======================================= + +.. automodule:: components.model_client.openai_client + + + + + + + + .. rubric:: Functions + + .. autosummary:: + + get_all_messages_content + get_first_message_content + get_probabilities + + + + + + .. rubric:: Classes + + .. autosummary:: + + OpenAIClient diff --git a/docs/source/apis/components/_autosummary/components.model_client.transformers_client.rst b/docs/source/apis/components/_autosummary/components.model_client.transformers_client.rst new file mode 100644 index 00000000..cb1c9d33 --- /dev/null +++ b/docs/source/apis/components/_autosummary/components.model_client.transformers_client.rst @@ -0,0 +1,29 @@ +components.model\_client.transformers\_client +============================================= + +.. automodule:: components.model_client.transformers_client + + + + + + + + .. rubric:: Functions + + .. autosummary:: + + average_pool + + + + + + .. rubric:: Classes + + .. autosummary:: + + TransformerEmbedder + TransformerLLM + TransformerReranker + TransformersClient diff --git a/docs/source/apis/components/_autosummary/components.model_client.utils.rst b/docs/source/apis/components/_autosummary/components.model_client.utils.rst new file mode 100644 index 00000000..7d7f919e --- /dev/null +++ b/docs/source/apis/components/_autosummary/components.model_client.utils.rst @@ -0,0 +1,16 @@ +components.model\_client.utils +============================== + +.. automodule:: components.model_client.utils + + + + + + + + .. rubric:: Functions + + .. autosummary:: + + parse_embedding_response diff --git a/docs/source/apis/components/_autosummary/components.output_parsers.outputs.rst b/docs/source/apis/components/_autosummary/components.output_parsers.outputs.rst new file mode 100644 index 00000000..797976da --- /dev/null +++ b/docs/source/apis/components/_autosummary/components.output_parsers.outputs.rst @@ -0,0 +1,24 @@ +components.output\_parsers.outputs +================================== + +.. automodule:: components.output_parsers.outputs + + + + + + + + + + + + .. rubric:: Classes + + .. autosummary:: + + BooleanOutputParser + JsonOutputParser + ListOutputParser + OutputParser + YamlOutputParser diff --git a/docs/source/apis/components/_autosummary/components.reasoning.chain_of_thought.rst b/docs/source/apis/components/_autosummary/components.reasoning.chain_of_thought.rst new file mode 100644 index 00000000..665486a7 --- /dev/null +++ b/docs/source/apis/components/_autosummary/components.reasoning.chain_of_thought.rst @@ -0,0 +1,4 @@ +components.reasoning.chain\_of\_thought +======================================= + +.. automodule:: components.reasoning.chain_of_thought diff --git a/docs/source/apis/components/_autosummary/components.retriever.bm25_retriever.rst b/docs/source/apis/components/_autosummary/components.retriever.bm25_retriever.rst new file mode 100644 index 00000000..6f869f55 --- /dev/null +++ b/docs/source/apis/components/_autosummary/components.retriever.bm25_retriever.rst @@ -0,0 +1,28 @@ +components.retriever.bm25\_retriever +==================================== + +.. 
automodule:: components.retriever.bm25_retriever + + + + + + + + .. rubric:: Functions + + .. autosummary:: + + split_text_by_word_fn + split_text_by_word_fn_then_lower_tokenized + split_text_tokenized + + + + + + .. rubric:: Classes + + .. autosummary:: + + BM25Retriever diff --git a/docs/source/apis/components/_autosummary/components.retriever.faiss_retriever.rst b/docs/source/apis/components/_autosummary/components.retriever.faiss_retriever.rst new file mode 100644 index 00000000..cad15914 --- /dev/null +++ b/docs/source/apis/components/_autosummary/components.retriever.faiss_retriever.rst @@ -0,0 +1,20 @@ +components.retriever.faiss\_retriever +===================================== + +.. automodule:: components.retriever.faiss_retriever + + + + + + + + + + + + .. rubric:: Classes + + .. autosummary:: + + FAISSRetriever diff --git a/docs/source/apis/components/_autosummary/components.retriever.llm_retriever.rst b/docs/source/apis/components/_autosummary/components.retriever.llm_retriever.rst new file mode 100644 index 00000000..d8c9c6c6 --- /dev/null +++ b/docs/source/apis/components/_autosummary/components.retriever.llm_retriever.rst @@ -0,0 +1,20 @@ +components.retriever.llm\_retriever +=================================== + +.. automodule:: components.retriever.llm_retriever + + + + + + + + + + + + .. rubric:: Classes + + .. autosummary:: + + LLMRetriever diff --git a/docs/source/apis/components/_autosummary/components.retriever.postgres_retriever.rst b/docs/source/apis/components/_autosummary/components.retriever.postgres_retriever.rst new file mode 100644 index 00000000..6a844203 --- /dev/null +++ b/docs/source/apis/components/_autosummary/components.retriever.postgres_retriever.rst @@ -0,0 +1,23 @@ +.. _components-retriever-postgres_retriever: + +components.retriever.postgres\_retriever +======================================== + +.. automodule:: components.retriever.postgres_retriever + + + + + + + + + + + + .. rubric:: Classes + + .. autosummary:: + + DistanceToOperator + PostgresRetriever diff --git a/docs/source/apis/components/_autosummary/components.retriever.reranker_retriever.rst b/docs/source/apis/components/_autosummary/components.retriever.reranker_retriever.rst new file mode 100644 index 00000000..3231d4a6 --- /dev/null +++ b/docs/source/apis/components/_autosummary/components.retriever.reranker_retriever.rst @@ -0,0 +1,20 @@ +components.retriever.reranker\_retriever +======================================== + +.. automodule:: components.retriever.reranker_retriever + + + + + + + + + + + + .. rubric:: Classes + + .. autosummary:: + + RerankerRetriever diff --git a/docs/source/apis/components/components.agent.rst b/docs/source/apis/components/components.agent.rst new file mode 100644 index 00000000..3bea7354 --- /dev/null +++ b/docs/source/apis/components/components.agent.rst @@ -0,0 +1,27 @@ +.. _components-agent: + +components.agent +======================== + +Submodules +---------- +.. autosummary:: + :toctree: _autosummary + + + components.agent.react + + + +.. toctree:: + :maxdepth: 4 + + components.agent.react + + +--------------- + +.. automodule:: components.agent + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apis/components/components.data_process.data_components.rst b/docs/source/apis/components/components.data_process.data_components.rst new file mode 100644 index 00000000..c567436c --- /dev/null +++ b/docs/source/apis/components/components.data_process.data_components.rst @@ -0,0 +1,9 @@ +.. 
_components-data_process-data_components: + +components.data\_process.data\_components +================================================ + +.. automodule:: components.data_process.data_components + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apis/components/components.data_process.document_splitter.rst b/docs/source/apis/components/components.data_process.document_splitter.rst new file mode 100644 index 00000000..8d9c0f3a --- /dev/null +++ b/docs/source/apis/components/components.data_process.document_splitter.rst @@ -0,0 +1,9 @@ +.. _components-data_process-document_splitter: + +components.data\_process.document\_splitter +================================================== + +.. automodule:: components.data_process.document_splitter + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apis/components/components.data_process.rst b/docs/source/apis/components/components.data_process.rst new file mode 100644 index 00000000..1b444e9f --- /dev/null +++ b/docs/source/apis/components/components.data_process.rst @@ -0,0 +1,30 @@ +.. _components-data_process: + +components.data\_process +================================ + +Submodules +---------- +.. autosummary:: + :toctree: _autosummary + + + components.data_process.data_components + + components.data_process.text_splitter + + + +.. toctree:: + :maxdepth: 4 + + components.data_process.data_components + components.data_process.text_splitter + + +--------------- + +.. automodule:: components.data_process + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apis/components/components.data_process.text_splitter.rst b/docs/source/apis/components/components.data_process.text_splitter.rst new file mode 100644 index 00000000..a5ceea86 --- /dev/null +++ b/docs/source/apis/components/components.data_process.text_splitter.rst @@ -0,0 +1,9 @@ +.. _components-data_process-text_splitter: + +components.data\_process.text\_splitter +============================================== + +.. automodule:: components.data_process.text_splitter + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apis/components/components.memory.memory.rst b/docs/source/apis/components/components.memory.memory.rst new file mode 100644 index 00000000..d6ca5051 --- /dev/null +++ b/docs/source/apis/components/components.memory.memory.rst @@ -0,0 +1,9 @@ +.. _components-memory-memory: + +components.memory.memory +=============================== + +.. automodule:: components.memory.memory + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apis/components/components.memory.rst b/docs/source/apis/components/components.memory.rst new file mode 100644 index 00000000..4505a185 --- /dev/null +++ b/docs/source/apis/components/components.memory.rst @@ -0,0 +1,27 @@ +.. _components-memory: + +components.memory +========================= + +Submodules +---------- +.. autosummary:: + :toctree: _autosummary + + + components.memory.memory + + + +.. toctree:: + :maxdepth: 4 + + components.memory.memory + + +--------------- + +.. automodule:: components.memory + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apis/components/components.model_client.rst b/docs/source/apis/components/components.model_client.rst new file mode 100644 index 00000000..a8fb35d8 --- /dev/null +++ b/docs/source/apis/components/components.model_client.rst @@ -0,0 +1,45 @@ +.. 
_components-model_client: + +components.model\_client +================================ + +Submodules +---------- +.. autosummary:: + :toctree: _autosummary + + + components.model_client.anthropic_client + + components.model_client.cohere_client + + components.model_client.google_client + + components.model_client.groq_client + + components.model_client.openai_client + + components.model_client.transformers_client + + components.model_client.utils + + + +.. toctree:: + :maxdepth: 4 + + components.model_client.anthropic_client + components.model_client.cohere_client + components.model_client.google_client + components.model_client.groq_client + components.model_client.openai_client + components.model_client.transformers_client + components.model_client.utils + + +--------------- + +.. automodule:: components.model_client + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apis/components/components.model_client.utils.rst b/docs/source/apis/components/components.model_client.utils.rst new file mode 100644 index 00000000..53ae77d2 --- /dev/null +++ b/docs/source/apis/components/components.model_client.utils.rst @@ -0,0 +1,9 @@ +.. _components-model_client-utils: + +components.model\_client.utils +===================================== + +.. automodule:: components.model_client.utils + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apis/components/components.output_parsers.rst b/docs/source/apis/components/components.output_parsers.rst new file mode 100644 index 00000000..729afabd --- /dev/null +++ b/docs/source/apis/components/components.output_parsers.rst @@ -0,0 +1,27 @@ +.. _components-output_parsers: + +components.output\_parsers +================================== + +Submodules +---------- +.. autosummary:: + :toctree: _autosummary + + + components.output_parsers.outputs + + + +.. toctree:: + :maxdepth: 4 + + components.output_parsers.outputs + + +--------------- + +.. automodule:: components.output_parsers + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apis/components/components.reasoning.rst b/docs/source/apis/components/components.reasoning.rst new file mode 100644 index 00000000..6c79f3cd --- /dev/null +++ b/docs/source/apis/components/components.reasoning.rst @@ -0,0 +1,27 @@ +.. _components-reasoning: + +components.reasoning +============================ + +Submodules +---------- +.. autosummary:: + :toctree: _autosummary + + + components.reasoning.chain_of_thought + + + +.. toctree:: + :maxdepth: 4 + + components.reasoning.chain_of_thought + + +--------------- + +.. automodule:: components.reasoning + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apis/components/components.retriever.postgres_retriever.rst b/docs/source/apis/components/components.retriever.postgres_retriever.rst new file mode 100644 index 00000000..099bd303 --- /dev/null +++ b/docs/source/apis/components/components.retriever.postgres_retriever.rst @@ -0,0 +1,9 @@ +.. _components-retriever-postgres_retriever: + +components.retriever.postgres\_retriever +=============================================== + +.. automodule:: components.retriever.postgres_retriever + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apis/components/components.retriever.rst b/docs/source/apis/components/components.retriever.rst new file mode 100644 index 00000000..fdc4e1bd --- /dev/null +++ b/docs/source/apis/components/components.retriever.rst @@ -0,0 +1,39 @@ +.. 
_components-retriever: + +components.retriever +============================ + +Submodules +---------- +.. autosummary:: + :toctree: _autosummary + + + components.retriever.bm25_retriever + + components.retriever.faiss_retriever + + components.retriever.llm_retriever + + components.retriever.postgres_retriever + + components.retriever.reranker_retriever + + + +.. toctree:: + :maxdepth: 4 + + components.retriever.bm25_retriever + components.retriever.faiss_retriever + components.retriever.llm_retriever + components.retriever.postgres_retriever + components.retriever.reranker_retriever + + +--------------- + +.. automodule:: components.retriever + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apis/components/index.rst b/docs/source/apis/components/index.rst index 0fef9a70..24890fa4 100644 --- a/docs/source/apis/components/index.rst +++ b/docs/source/apis/components/index.rst @@ -1,3 +1,5 @@ +.. _apis-components: + Components ============== @@ -9,7 +11,8 @@ Overview components.agent components.model_client - + components.data_process + .. components.reasoning components.retriever @@ -37,6 +40,13 @@ Model Clients components.model_client +Data Process +---------------- +.. toctree:: + :maxdepth: 1 + + components.data_process + .. Embedders .. --------- .. .. toctree:: @@ -57,4 +67,3 @@ Retrievers :maxdepth: 1 components.retriever - diff --git a/docs/source/apis/core/core.base_data_class.rst b/docs/source/apis/core/core.base_data_class.rst new file mode 100644 index 00000000..8e629861 --- /dev/null +++ b/docs/source/apis/core/core.base_data_class.rst @@ -0,0 +1,9 @@ +.. _core-base_data_class: + +core.base\_data\_class +============================= + +.. automodule:: core.base_data_class + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apis/core/core.component.rst b/docs/source/apis/core/core.component.rst new file mode 100644 index 00000000..aab446e2 --- /dev/null +++ b/docs/source/apis/core/core.component.rst @@ -0,0 +1,9 @@ +.. _core-component: + +core.component +===================== + +.. automodule:: core.component + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apis/core/core.data_components.rst b/docs/source/apis/core/core.data_components.rst new file mode 100644 index 00000000..f75d1991 --- /dev/null +++ b/docs/source/apis/core/core.data_components.rst @@ -0,0 +1,9 @@ +.. _core-data_components: + +core.data\_components +============================ + +.. automodule:: core.data_components + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apis/core/core.db.rst b/docs/source/apis/core/core.db.rst new file mode 100644 index 00000000..ca74e130 --- /dev/null +++ b/docs/source/apis/core/core.db.rst @@ -0,0 +1,9 @@ +.. _core-db: + +core.db +============== + +.. automodule:: core.db + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apis/core/core.default_prompt_template.rst b/docs/source/apis/core/core.default_prompt_template.rst new file mode 100644 index 00000000..e9f538fa --- /dev/null +++ b/docs/source/apis/core/core.default_prompt_template.rst @@ -0,0 +1,9 @@ +.. _core-default_prompt_template: + +core.default\_prompt\_template +===================================== + +.. 
automodule:: core.default_prompt_template + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apis/core/core.document_splitter.rst b/docs/source/apis/core/core.document_splitter.rst new file mode 100644 index 00000000..48c1607a --- /dev/null +++ b/docs/source/apis/core/core.document_splitter.rst @@ -0,0 +1,9 @@ +.. _core-document_splitter: + +core.document\_splitter +============================== + +.. automodule:: core.document_splitter + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apis/core/core.embedder.rst b/docs/source/apis/core/core.embedder.rst new file mode 100644 index 00000000..195bb6c3 --- /dev/null +++ b/docs/source/apis/core/core.embedder.rst @@ -0,0 +1,9 @@ +.. _core-embedder: + +core.embedder +==================== + +.. automodule:: core.embedder + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apis/core/core.func_tool.rst b/docs/source/apis/core/core.func_tool.rst new file mode 100644 index 00000000..9e3c5e3e --- /dev/null +++ b/docs/source/apis/core/core.func_tool.rst @@ -0,0 +1,9 @@ +.. _core-func_tool: + +core.func\_tool +====================== + +.. automodule:: core.func_tool + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apis/core/core.functional.rst b/docs/source/apis/core/core.functional.rst new file mode 100644 index 00000000..222c411c --- /dev/null +++ b/docs/source/apis/core/core.functional.rst @@ -0,0 +1,9 @@ +.. _core-functional: + +core.functional +====================== + +.. automodule:: core.functional + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apis/core/core.generator.rst b/docs/source/apis/core/core.generator.rst new file mode 100644 index 00000000..df0c8e38 --- /dev/null +++ b/docs/source/apis/core/core.generator.rst @@ -0,0 +1,9 @@ +.. _core-generator: + +core.generator +===================== + +.. automodule:: core.generator + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apis/core/core.memory.rst b/docs/source/apis/core/core.memory.rst new file mode 100644 index 00000000..178062ab --- /dev/null +++ b/docs/source/apis/core/core.memory.rst @@ -0,0 +1,9 @@ +.. _core-memory: + +core.memory +================== + +.. automodule:: core.memory + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apis/core/core.model_client.rst b/docs/source/apis/core/core.model_client.rst new file mode 100644 index 00000000..d4fb3a6d --- /dev/null +++ b/docs/source/apis/core/core.model_client.rst @@ -0,0 +1,9 @@ +.. _core-model_client: + +core.model\_client +========================= + +.. automodule:: core.model_client + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apis/core/core.parameter.rst b/docs/source/apis/core/core.parameter.rst new file mode 100644 index 00000000..467c4b33 --- /dev/null +++ b/docs/source/apis/core/core.parameter.rst @@ -0,0 +1,9 @@ +.. _core-parameter: + +core.parameter +===================== + +.. automodule:: core.parameter + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apis/core/core.prompt_builder.rst b/docs/source/apis/core/core.prompt_builder.rst new file mode 100644 index 00000000..247ae199 --- /dev/null +++ b/docs/source/apis/core/core.prompt_builder.rst @@ -0,0 +1,9 @@ +.. _core-prompt_builder: + +core.prompt\_builder +=========================== + +.. 
automodule:: core.prompt_builder + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apis/core/core.retriever.rst b/docs/source/apis/core/core.retriever.rst new file mode 100644 index 00000000..2182485e --- /dev/null +++ b/docs/source/apis/core/core.retriever.rst @@ -0,0 +1,9 @@ +.. _core-retriever: + +core.retriever +===================== + +.. automodule:: core.retriever + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apis/core/core.string_parser.rst b/docs/source/apis/core/core.string_parser.rst new file mode 100644 index 00000000..e9527602 --- /dev/null +++ b/docs/source/apis/core/core.string_parser.rst @@ -0,0 +1,9 @@ +.. _core-string_parser: + +core.string\_parser +========================== + +.. automodule:: core.string_parser + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apis/core/core.tokenizer.rst b/docs/source/apis/core/core.tokenizer.rst new file mode 100644 index 00000000..de663456 --- /dev/null +++ b/docs/source/apis/core/core.tokenizer.rst @@ -0,0 +1,9 @@ +.. _core-tokenizer: + +core.tokenizer +===================== + +.. automodule:: core.tokenizer + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apis/core/core.tool_manager.rst b/docs/source/apis/core/core.tool_manager.rst new file mode 100644 index 00000000..852c3265 --- /dev/null +++ b/docs/source/apis/core/core.tool_manager.rst @@ -0,0 +1,9 @@ +.. _core-tool_manager: + +core.tool\_manager +========================= + +.. automodule:: core.tool_manager + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apis/core/core.types.rst b/docs/source/apis/core/core.types.rst new file mode 100644 index 00000000..328324cb --- /dev/null +++ b/docs/source/apis/core/core.types.rst @@ -0,0 +1,9 @@ +.. _core-types: + +core.types +================= + +.. automodule:: core.types + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apis/core/index.rst b/docs/source/apis/core/index.rst index 1010b29b..c9f7399b 100644 --- a/docs/source/apis/core/index.rst +++ b/docs/source/apis/core/index.rst @@ -1,3 +1,5 @@ +.. _apis-core: + Core =================== @@ -8,23 +10,21 @@ Overview .. autosummary:: core.base_data_class - core.model_client core.component - core.data_components core.db core.default_prompt_template - core.document_splitter core.embedder core.functional core.generator core.memory + core.model_client core.parameter core.prompt_builder core.retriever core.string_parser - core.text_splitter core.tokenizer - core.tool_helper + core.func_tool + core.tool_manager core.types @@ -47,11 +47,9 @@ Data Handling .. toctree:: :maxdepth: 1 - core.base_data_class + core.base_data_class core.types - - core.data_components core.db Prompts and Templates @@ -62,10 +60,10 @@ Prompts and Templates core.default_prompt_template core.prompt_builder -Document Processing -------------------- -.. toctree:: - :maxdepth: 1 +.. Document Processing +.. ------------------- +.. .. toctree:: +.. :maxdepth: 1 .. core.document_splitter core.text_splitter @@ -87,18 +85,17 @@ Generation and Utilities core.functional core.memory -Parsing and Tokenization ------------------------ .. toctree:: :maxdepth: 1 core.string_parser core.tokenizer - core.tool_helper + core.func_tool Parameters ------------------------ .. 
toctree:: :maxdepth: 1 - core.parameter \ No newline at end of file + core.parameter diff --git a/docs/source/apis/eval/eval.answer_match_acc.rst b/docs/source/apis/eval/eval.answer_match_acc.rst new file mode 100644 index 00000000..ec068ca7 --- /dev/null +++ b/docs/source/apis/eval/eval.answer_match_acc.rst @@ -0,0 +1,9 @@ +.. _eval-answer_match_acc: + +eval.answer\_match\_acc +============================== + +.. automodule:: eval.answer_match_acc + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apis/eval/eval.evaluators.rst b/docs/source/apis/eval/eval.evaluators.rst new file mode 100644 index 00000000..d6ccf71b --- /dev/null +++ b/docs/source/apis/eval/eval.evaluators.rst @@ -0,0 +1,9 @@ +.. _eval-evaluators: + +eval.evaluators +====================== + +.. automodule:: eval.evaluators + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apis/eval/eval.llm_as_judge.rst b/docs/source/apis/eval/eval.llm_as_judge.rst new file mode 100644 index 00000000..e9da7032 --- /dev/null +++ b/docs/source/apis/eval/eval.llm_as_judge.rst @@ -0,0 +1,9 @@ +.. _eval-llm_as_judge: + +eval.llm\_as\_judge +========================== + +.. automodule:: eval.llm_as_judge + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apis/eval/eval.retriever_recall.rst b/docs/source/apis/eval/eval.retriever_recall.rst new file mode 100644 index 00000000..1e1b7f4d --- /dev/null +++ b/docs/source/apis/eval/eval.retriever_recall.rst @@ -0,0 +1,9 @@ +.. _eval-retriever_recall: + +eval.retriever\_recall +============================= + +.. automodule:: eval.retriever_recall + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apis/eval/eval.retriever_relevance.rst b/docs/source/apis/eval/eval.retriever_relevance.rst new file mode 100644 index 00000000..737de569 --- /dev/null +++ b/docs/source/apis/eval/eval.retriever_relevance.rst @@ -0,0 +1,9 @@ +.. _eval-retriever_relevance: + +eval.retriever\_relevance +================================ + +.. automodule:: eval.retriever_relevance + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apis/eval/index.rst b/docs/source/apis/eval/index.rst index 966c01a1..d3b09e39 100644 --- a/docs/source/apis/eval/index.rst +++ b/docs/source/apis/eval/index.rst @@ -1,3 +1,5 @@ +.. _apis-eval: + Evaluation ============== diff --git a/docs/source/apis/index.rst b/docs/source/apis/index.rst index 548a2e90..6b4af1d2 100644 --- a/docs/source/apis/index.rst +++ b/docs/source/apis/index.rst @@ -17,7 +17,6 @@ The core section of the LightRAG API documentation provides detailed information core.data_components core.db core.default_prompt_template - core.document_splitter core.embedder core.functional core.generator @@ -26,9 +25,9 @@ The core section of the LightRAG API documentation provides detailed information core.prompt_builder core.retriever core.string_parser - core.text_splitter core.tokenizer - core.tool_helper + core.func_tool + core.tool_manager core.types @@ -41,9 +40,9 @@ The components section of the LightRAG API documentation outlines the detailed s components.agent components.model_client - + componnets.data_process .. 
components.reasoning - + components.retriever components.output_parsers @@ -126,4 +125,4 @@ Utils :maxdepth: 2 :hidden: - utils/index \ No newline at end of file + utils/index diff --git a/docs/source/apis/optim/index.rst b/docs/source/apis/optim/index.rst index 74894349..3e2f23d8 100644 --- a/docs/source/apis/optim/index.rst +++ b/docs/source/apis/optim/index.rst @@ -1,3 +1,5 @@ +.. _apis-optim: + .. Optimizer .. ============== @@ -22,4 +24,4 @@ Optimizer optim.sampler optim.few_shot_optimizer optim.llm_augment - optim.llm_optimizer \ No newline at end of file + optim.llm_optimizer diff --git a/docs/source/apis/optim/optim.few_shot_optimizer.rst b/docs/source/apis/optim/optim.few_shot_optimizer.rst new file mode 100644 index 00000000..352302ea --- /dev/null +++ b/docs/source/apis/optim/optim.few_shot_optimizer.rst @@ -0,0 +1,9 @@ +.. _optim-few_shot_optimizer: + +optim.few\_shot\_optimizer +================================= + +.. automodule:: optim.few_shot_optimizer + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apis/optim/optim.llm_augment.rst b/docs/source/apis/optim/optim.llm_augment.rst new file mode 100644 index 00000000..ce88ae79 --- /dev/null +++ b/docs/source/apis/optim/optim.llm_augment.rst @@ -0,0 +1,9 @@ +.. _optim-llm_augment: + +optim.llm\_augment +========================= + +.. automodule:: optim.llm_augment + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apis/optim/optim.llm_optimizer.rst b/docs/source/apis/optim/optim.llm_optimizer.rst new file mode 100644 index 00000000..981ca228 --- /dev/null +++ b/docs/source/apis/optim/optim.llm_optimizer.rst @@ -0,0 +1,9 @@ +.. _optim-llm_optimizer: + +optim.llm\_optimizer +=========================== + +.. automodule:: optim.llm_optimizer + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apis/optim/optim.optimizer.rst b/docs/source/apis/optim/optim.optimizer.rst new file mode 100644 index 00000000..2a0e799a --- /dev/null +++ b/docs/source/apis/optim/optim.optimizer.rst @@ -0,0 +1,9 @@ +.. _optim-optimizer: + +optim.optimizer +====================== + +.. automodule:: optim.optimizer + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apis/optim/optim.sampler.rst b/docs/source/apis/optim/optim.sampler.rst new file mode 100644 index 00000000..207a4cb6 --- /dev/null +++ b/docs/source/apis/optim/optim.sampler.rst @@ -0,0 +1,9 @@ +.. _optim-sampler: + +optim.sampler +==================== + +.. automodule:: optim.sampler + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apis/tracing/index.rst b/docs/source/apis/tracing/index.rst index 66ffceef..7a266ecd 100644 --- a/docs/source/apis/tracing/index.rst +++ b/docs/source/apis/tracing/index.rst @@ -1,3 +1,5 @@ +.. _apis-tracing: + Tracing ============== @@ -22,4 +24,4 @@ Loggers :maxdepth: 1 tracing.generator_state_logger - tracing.generator_call_logger \ No newline at end of file + tracing.generator_call_logger diff --git a/docs/source/apis/tracing/tracing.decorators.rst b/docs/source/apis/tracing/tracing.decorators.rst new file mode 100644 index 00000000..688c1786 --- /dev/null +++ b/docs/source/apis/tracing/tracing.decorators.rst @@ -0,0 +1,9 @@ +.. _tracing-decorators: + +tracing.decorators +========================= + +.. 
automodule:: tracing.decorators + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apis/tracing/tracing.generator_call_logger.rst b/docs/source/apis/tracing/tracing.generator_call_logger.rst new file mode 100644 index 00000000..db70d6b6 --- /dev/null +++ b/docs/source/apis/tracing/tracing.generator_call_logger.rst @@ -0,0 +1,9 @@ +.. _tracing-generator_call_logger: + +tracing.generator\_call\_logger +====================================== + +.. automodule:: tracing.generator_call_logger + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apis/tracing/tracing.generator_state_logger.rst b/docs/source/apis/tracing/tracing.generator_state_logger.rst new file mode 100644 index 00000000..1b562a96 --- /dev/null +++ b/docs/source/apis/tracing/tracing.generator_state_logger.rst @@ -0,0 +1,9 @@ +.. _tracing-generator_state_logger: + +tracing.generator\_state\_logger +======================================= + +.. automodule:: tracing.generator_state_logger + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apis/utils/index.rst b/docs/source/apis/utils/index.rst index 4f13a5c5..fb339773 100644 --- a/docs/source/apis/utils/index.rst +++ b/docs/source/apis/utils/index.rst @@ -1,3 +1,5 @@ +.. _apis-utils: + Utils ============================= @@ -31,4 +33,3 @@ Setup_env :maxdepth: 1 utils.setup_env - diff --git a/docs/source/apis/utils/utils.config.rst b/docs/source/apis/utils/utils.config.rst new file mode 100644 index 00000000..a786dc7e --- /dev/null +++ b/docs/source/apis/utils/utils.config.rst @@ -0,0 +1,9 @@ +.. _utils-config: + +utils.config +=================== + +.. automodule:: utils.config + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apis/utils/utils.file_io.rst b/docs/source/apis/utils/utils.file_io.rst new file mode 100644 index 00000000..0b3ffb8b --- /dev/null +++ b/docs/source/apis/utils/utils.file_io.rst @@ -0,0 +1,9 @@ +.. _utils-file_io: + +utils.file\_io +===================== + +.. automodule:: utils.file_io + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apis/utils/utils.lazy_import.rst b/docs/source/apis/utils/utils.lazy_import.rst new file mode 100644 index 00000000..b76130ea --- /dev/null +++ b/docs/source/apis/utils/utils.lazy_import.rst @@ -0,0 +1,9 @@ +.. _utils-lazy_import: + +utils.lazy\_import +========================= + +.. automodule:: utils.lazy_import + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apis/utils/utils.logger.rst b/docs/source/apis/utils/utils.logger.rst new file mode 100644 index 00000000..4d9d9d42 --- /dev/null +++ b/docs/source/apis/utils/utils.logger.rst @@ -0,0 +1,9 @@ +.. _utils-logger: + +utils.logger +=================== + +.. automodule:: utils.logger + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apis/utils/utils.registry.rst b/docs/source/apis/utils/utils.registry.rst new file mode 100644 index 00000000..77a8095e --- /dev/null +++ b/docs/source/apis/utils/utils.registry.rst @@ -0,0 +1,9 @@ +.. _utils-registry: + +utils.registry +===================== + +.. automodule:: utils.registry + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apis/utils/utils.serialization.rst b/docs/source/apis/utils/utils.serialization.rst new file mode 100644 index 00000000..61980a68 --- /dev/null +++ b/docs/source/apis/utils/utils.serialization.rst @@ -0,0 +1,9 @@ +.. _utils-serialization: + +utils.serialization +========================== + +.. 
automodule:: utils.serialization + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apis/utils/utils.setup_env.rst b/docs/source/apis/utils/utils.setup_env.rst new file mode 100644 index 00000000..dc669ad5 --- /dev/null +++ b/docs/source/apis/utils/utils.setup_env.rst @@ -0,0 +1,9 @@ +.. _utils-setup_env: + +utils.setup\_env +======================= + +.. automodule:: utils.setup_env + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/conf.py b/docs/source/conf.py index 986d5c25..77ae00ff 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -15,27 +15,18 @@ sys.path.insert(0, os.path.abspath("../../")) -sys.path.insert(0, os.path.abspath("../../lightrag")) +sys.path.insert(0, os.path.abspath("../../lightrag/lightrag")) # # need to insert the paths # for dir in os.walk('../../lightrag'): # sys.path.insert(0, dir[0]) # # print(dir[0]) -import lightrag - -import lightrag.components -import lightrag.core -import lightrag.eval -import lightrag.utils -import lightrag.tracing -import lightrag.optim # -- Project information ----------------------------------------------------- project = "LightRAG" -copyright = "2024, SylphAI" -author = "SylphAI" - +copyright = "2024, SylphAI, Inc" +author = "SylphAI, Inc" # -- General configuration --------------------------------------------------- @@ -56,13 +47,15 @@ "sphinx_design", "sphinx_copybutton", "nbsphinx", - "sphinx_search.extension" + "sphinx_search.extension", # "myst_nb", # "sphinx.builders.changes", # 'recommonmark', # 'myst_parser' ] +html_show_sphinx = False + # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] @@ -135,4 +128,6 @@ def setup(app): - app.add_css_file("css/custom.css") # Add custom CSS file to the Sphinx configuration + app.add_css_file( + "css/custom.css" + ) # Add custom CSS file to the Sphinx configuration diff --git a/docs/source/developer_notes/agent.rst b/docs/source/developer_notes/agent.rst index a611bf52..0c9b6834 100644 --- a/docs/source/developer_notes/agent.rst +++ b/docs/source/developer_notes/agent.rst @@ -1,2 +1,26 @@ Agent -==================== \ No newline at end of file +==================== + +.. epigraph:: + + “An autonomous agent is a system situated within and a part of an environment that senses that environment and acts on it, over time, in pursuit of its own agenda and so as to effect what it senses in the future.” + + -- Franklin and Graesser (1997) + +Agents are LLM-based and themselves belong to another popular family of LLM applications besides of the well-known RAGs. +The key on Agents are their ability to reasoning, plannning, and acting via accessible tools. +In LightRAG, agents are simply a generator which can use tools, take multiple steps(sequential or parallel ) to complete a user query. + + + +.. admonition:: References + :class: highlight + + 1. A survey on large language model based autonomous agents: https://github.com/Paitesanshi/LLM-Agent-Survey + 2. ReAct: https://arxiv.org/abs/2210.03629 + + +.. admonition:: API References + :class: highlight + + - :class:`components.agent.react.ReactAgent` diff --git a/docs/source/developer_notes/base_data_class.rst b/docs/source/developer_notes/base_data_class.rst index 8ba75bfd..b06125ef 100644 --- a/docs/source/developer_notes/base_data_class.rst +++ b/docs/source/developer_notes/base_data_class.rst @@ -7,232 +7,569 @@ DataClass `Li Yin `_ -In PyTorch, ``Tensor`` is the data type used in ``Module`` and ``Optimizer`` across the library. 
-The data in particular is a multi-dimensional matrix such as such as weights, biases, and even inputs and predictions. -In LLM applications, you can think of the data as a freeform data class with various fields and types of data. -For instance: +In `PyTorch`, ``Tensor`` is the data type used in ``Module`` and ``Optimizer`` across the library. +Tensor wraps a multi-dimensional matrix to better support its operations and computations. +In LLM applications, data constantly needs to interact with LLMs in the form of strings via prompt and be parsed back to structured data from LLMs' text prediction. +:class:`core.base_data_class.DataClass` is designed to ease the data interaction with LLMs via prompt(input) and text prediction(output). + +.. figure:: /_static/images/dataclass.png + :align: center + :alt: DataClass + :width: 680px + + DataClass is to ease the data interaction with LLMs via prompt(input) and text prediction(output). + + +Design +---------------- +In Python, data is typically represented as a class with attributes. +To interact with LLM, we need great way to describe the data format and the data instance to LLMs and be able to convert back to data instance from the text prediction. +This overlaps with the serialization and deserialization of the data in the conventional programming. +Packages like ``Pydantic`` or ``Marshmallow`` can covers the seralization and deserialization, but it will end up with more complexity and less transparency to users. +LLM prompts are known to be sensitive, the details, controllability, and transparency of the data format are crucial here. + +We eventually created a base class :class:`core.base_data_class.DataClass` to handle data that will interact with LLMs, which builds on top of Python's native ``dataclasses`` module. +Here are our reasoning: + +1. ``dataclasses`` module is lightweight, flexible, and is already widely used in Python for data classes. +2. Using ``field`` (`metadata`, `default`, `default_factory`) in `dataclasses` adds more ways to describe the data. +3. ``asdict()`` from `dataclasses` is already good at converting a data class instance to a dictionary for serialization. +4. Getting data class schmea for data class is feasible. + + +Here is how users typically use the ``dataclasses`` module: .. code-block:: python - from dataclasses import dataclass + from dataclasses import dataclass, field @dataclass class TrecData: - question: str - label: int + question: str = field( + metadata={"desc": "The question asked by the user"} + ) # Required field, you have to provide the question field at the instantiation + label: int = field( + metadata={"desc": "The label of the question"}, default=0 + ) # Optional field -It is exactly a single input data item in a typical PyTorch ``Dataset`` or a `HuggingFace` ``Dataset``. -The unique thing is all data or tools interact with LLMs via prompt and text prediction, which is a single ``str``. +``DataClass`` covers the following: -Most existing libraries use `Pydantic` to handle the serialization(convert to string) and deserialization(convert from string) of the data. -But, in LightRAG, we in particular designed :class:`core.base_data_class.DataClass` using native `dataclasses` module. -The reasons are: +1. Generate the class ``schema`` and ``signature`` (less verbose) to describe the data format to LLMs. +2. Convert the data instance to a json or yaml string to show the data example to LLMs. +3. 
Load the data instance from a json or yaml string to get the data instance back to be processed in the program. -1. ``dataclasses`` module's `dataclass` decorator, along with `field` (`metadata`, `default`) can be especially helpful to describe the data format to LLMs. `dataclass` also saves users time on writing the boilerplate code such as `__init__`, `__repr__`, `__str__` etc. +We also made the effort to provide more control: -2. `dataclasses` native module is more lightweight, flexible, and user-friendly than `Pydantic`. +1. **Keep the ordering of your data fields.** We provided :func:`core.base_data_class.required_field` with ``default_factory`` to mark the field as required even if it is after optional fields. We also has to do customization to preserve their ordering while being converted to dictionary, json and yaml string. +2. **Exclude some fields from the output.** All serialization methods support `exclude` parameter to exclude some fields even for nested dataclasses. +3. **Allow nested dataclasses, lists, and dictionaries.** All methods support nested dataclasses, lists, and dictionaries. -3. Though we need more customization on ``BaseClass`` compared with directly using `Pydantic`, we will enjoy more transparency and control over the data format. -Here is how users can define a data class with our customized methods in LightRAG: +Describing the Data Format +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. code-block:: python +.. list-table:: + :header-rows: 1 + :widths: 40 70 + + * - **Name** + - **Description** + * - ``to_schema(cls, exclude) -> Dict`` + - Generate a JSON schema which is more detailed than the signature. + * - ``to_schema_str(cls, exclude) -> str`` + - Generate a JSON schema string which is more detailed than the signature. + * - ``to_yaml_signature(cls, exclude) -> str`` + - Generate a YAML signature for the class from descriptions in metadata. + * - ``to_json_signature(cls, exclude) -> str`` + - Generate a JSON signature (JSON string) for the class from descriptions in metadata. + * - ``format_class_str(cls, format_type, exclude) -> str`` + - Generate data format string, covers ``to_schema_str``, ``to_yaml_signature``, and ``to_json_signature``. + +Work with Data Instance +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - from lightrag.core.base_data_class import ( - DataClass, - required_field, - ) - from dataclasses import field +.. list-table:: + :header-rows: 1 + :widths: 40 70 + + * - **Name** + - **Description** + * - ``from_dict(cls, data: Dict) -> "DataClass"`` + - Create a dataclass instance from a dictionary. Supports nested dataclasses, lists, and dictionaries. + * - ``to_dict(self, exclude: ExcludeType) -> Dict`` + - Convert a dataclass object to a dictionary. Supports nested dataclasses, lists, and dictionaries. Allows exclusion of specific fields. + * - ``to_json_obj(self, exclude: ExcludeType) -> Any`` + - Convert the dataclass instance to a JSON object, maintaining the order of fields. + * - ``to_json(self, exclude: ExcludeType) -> str`` + - Convert the dataclass instance to a JSON string, maintaining the order of fields. + * - ``to_yaml_obj(self, exclude: ExcludeType) -> Any`` + - Convert the dataclass instance to a YAML object, maintaining the order of fields. + * - ``to_yaml(self, exclude: ExcludeType) -> str`` + - Convert the dataclass instance to a YAML string, maintaining the order of fields. + * - ``from_json(cls, json_str: str) -> "DataClass"`` + - Create a dataclass instance from a JSON string. 
+ * - ``from_yaml(cls, yaml_str: str) -> "DataClass"`` + - Create a dataclass instance from a YAML string. + * - ``format_example_str(self, format_type, exclude) -> str`` + - Generate data examples string, covers ``to_json`` and ``to_yaml``. + +We have :class:`core.base_data_class.DataClassFormatType` to specify the format type for the data format methods. +.. note:: - class MyOutputs(DataClass): - name: str = field( - default="John Doe", # Optional field - metadata={"desc": "The name of the person", "prefix": "Name:"}, - ) - age: int = field( - default_factory=required_field, # Required field - metadata={"desc": "The age of the person", "prefix": "Age:"}, - ) + To use ``DataClass``, you have to decorate your class with the ``dataclass`` decorator from the ``dataclasses`` module. -.. note:: +.. in Python is a decorator that can be used to automatically generate special methods such as `__init__`, `__repr__`, `__str__` etc. for a class. - `required_field` is a helper function to mark the field as required. Otherwise, using either `default` or `default_factory` will make the field optional. +.. .. code-block:: python -.. Now, let's see how we design class and instance methods to describe the data format and the data instance to LLMs. +.. from dataclasses import dataclass +.. @dataclass +.. class TrecData: +.. question: str +.. label: int -Describe data to LLMs -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Data Format -^^^^^^^^^^^^^^^^^^^^^^^^^ +.. It is exactly a single input data item in a typical PyTorch ``Dataset`` or a `HuggingFace` ``Dataset``. +.. The unique thing is all data or tools interact with LLMs via prompt and text prediction, which is a single ``str``. + +.. Most existing libraries use `Pydantic` to handle the serialization(convert to string) and deserialization(convert back from string) of the data. +.. But, in LightRAG, we in particular designed :class:`core.base_data_class.DataClass` using native `dataclasses` module. +.. The reasons are: -We need to describe either the input/output data format to give LLMs context on how to understand the input data and to generate the output data. +.. 1. ``dataclasses`` module's `dataclass` decorator, along with `field` (`metadata`, `default`) can be especially helpful to describe the data format to LLMs. `dataclass` also saves users time on writing the boilerplate code such as `__init__`, `__repr__`, `__str__` etc. -What we want to let LLM know about our input/output data format: -In particular, it is important for LLMs to know these five things about the data format: +.. 2. `dataclasses` native module is more lightweight, flexible, and user-friendly than `Pydantic`. -1. **Description** of what this field is for. We use `desc` key in the `metadata` of `field` to describe this field. Example: +.. 3. Though we need more customization on ``BaseClass`` compared with directly using `Pydantic`, we will enjoy more transparency and control over the data format. +DataClass in Action +------------------------ +Say you have a few of ``TrecData`` structued as follows that you want to engage with LLMs: .. code-block:: python - thought: str = field( - metadata={"desc": "The reasoning or thought behind the question."} - ) + from dataclasses import dataclass, field -2. **Required/Optional**. We use either `default` or `default_factory` to mark the field as optional except when our specialized function :func:`core.base_data_class.required_field` is used in `default_factory`, which marks the field as required. -3. 
**Field Data Type** such as `str`, `int`, `float`, `bool`, `List`, `Dict`, etc. -4. **Order of the fields** matter as in a typical Chain of Thought, we want the reasoning/thought field to be in the output ahead of the answer. -5. The ablility to **exclude** some fields from the output. - -We provide two ways: (1) ``schema`` and (2) ``signature`` to describe the data format in particular. + @dataclass + class Question: + question: str = field( + metadata={"desc": "The question asked by the user"} + ) + metadata: dict = field( + metadata={"desc": "The metadata of the question"}, default_factory=dict + ) -**Schema** + @dataclass + class TrecData: + question: Question = field( + metadata={"desc": "The question asked by the user"} + ) # Required field, you have to provide the question field at the instantiation + label: int = field( + metadata={"desc": "The label of the question"}, default=0 + ) # Optional field + +Describe the data format to LLMs +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +We will create ``TrecData2`` class that subclasses from `DataClass`. +You decide to add a field ``metadata`` to the ``TrecData`` class to store the metadata of the question. +For your own reason, you want ``metadata`` to be a required field and you want to keep the ordering of your fields while being converted to strings. +``DataClass`` will help you achieve this using :func:`core.base_data_class.required_field` on the `default_factory` of the field. +Normally, this is not possible with the native `dataclasses` module as it will raise an error if you put a required field after an optional field. -``schema`` will be a dict or json string and it is more verbose compared with ``signature``. -``signature`` imitates the exact data format (`yaml` or `json`) that you want LLMs to generate. +.. note:: -Here is a quick example on our ``schema`` for the ``MyOutputs`` data class using the `to_data_class_schema` method: + **Order of the fields** matter as in a typical Chain of Thought, we want the reasoning/thought field to be in the output ahead of the answer. .. code-block:: python - MyOutputs.to_data_class_schema() + from lightrag.core import DataClass, required_field -The output will be a dict: + @dataclass + class TrecData2(DataClass): + question: Question = field( + metadata={"desc": "The question asked by the user"} + ) # Required field, you have to provide the question field at the instantiation + label: int = field( + metadata={"desc": "The label of the question"}, default=0 + ) # Optional field + metadata: dict = field( + metadata={"desc": "The metadata of the question"}, default_factory=required_field() + ) # required field -.. code-block:: json +**Schema** - { - "name": { - "type": "str", - "desc": "The name of the person", - "required": false - }, - "age": { - "type": "int", - "desc": "The age of the person", - "required": true - } - } +Now, let us see the schema of the ``TrecData2`` class: + +.. code-block:: python -You can use `to_data_class_schema_str` to have the json string output. + print(TrecData2.to_schema()) -In comparison with the schema used in other libraries: +The output will be: -.. code-block:: json +.. 
code-block:: { + "type": "TrecData2", "properties": { - "name": { - "title": "Name", - "description": "The name of the user", - "default": "John Doe", - "type": "string", - }, - "age": { - "title": "Age", - "description": "The age of the user", - "type": "integer", + "question": { + "type": "{'type': 'Question', 'properties': {'question': {'type': 'str', 'desc': 'The question asked by the user'}, 'metadata': {'type': 'dict', 'desc': 'The metadata of the question'}}, 'required': ['question']}", + "desc": "The question asked by the user", }, + "label": {"type": "int", "desc": "The label of the question"}, + "metadata": {"type": "dict", "desc": "The metadata of the question"}, }, - "required": ["age"], + "required": ["question", "metadata"], } -Even our ``schema`` is more token efficient as you can see. We opted out of the `default` field as it is more of a fallback value in the program -rather than a description of the data format to LLMs. +As you can see, it handles the nested dataclass `Question` and the required field `metadata` correctly. -.. note:: - If you use ``schema`` (json string) to instruct LLMs to output `yaml` data, the LLMs might get confused and can potentially output `json` data instead. +.. note:: + + ``Optional`` type hint will not affect the field's required status. You can use this to work with static type checkers such as `mypy` if you want to. **Signature** -``signature`` is a string that imitates the exact data format (here we support `yaml` or `json`) that you want LLMs to generate. - -Let's use class methods ``to_json_signature`` and ``to_yaml_signature`` to generate the signature for the ``MyOutputs`` data class: +As schema can be rather verbose, and sometimes it works better to be more concise, and to mimick the output data structure that you want. +Say, you want LLM to generate a ``yaml`` or ``json`` string and later you can convert it back to a dictionary or even your data instance. +We can do so using the signature: .. code-block:: python - print(MyOutputs.to_json_signature()) - print(MyOutputs.to_yaml_signature()) + print(TrecData2.to_json_signature()) The json signature output will be: -.. code-block:: json +.. code-block:: { - "name": "The name of the person (str) (optional)", - "age": "The age of the person (int) (required)" + "question": "The question asked by the user ({'type': 'Question', 'properties': {'question': {'type': 'str', 'desc': 'The question asked by the user'}, 'metadata': {'type': 'dict', 'desc': 'The metadata of the question'}}, 'required': ['question']}) (required)", + "label": "The label of the question (int) (optional)", + "metadata": "The metadata of the question (dict) (required)" } -The yaml signature output will be: +To yaml signature: -.. code-block:: yaml +.. code-block:: - name: The name of the person (str) (optional) - age: The age of the person (int) (required) + question: The question asked by the user ({'type': 'Question', 'properties': {'question': {'type': 'str', 'desc': 'The question asked by the user'}, 'metadata': {'type': 'dict', 'desc': 'The metadata of the question'}}, 'required': ['question']}) (required) + label: The label of the question (int) (optional) + metadata: The metadata of the question (dict) (required) -All of the above methods support `exclude` parameter to exclude some fields from the output. +.. note:: + + If you use ``schema`` (json string) to instruct LLMs to output `yaml` data, the LLMs might get confused and can potentially output `json` data instead. 
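To make the intended use of the signature concrete, here is a minimal sketch, assuming the ``TrecData2`` and ``Question`` classes defined above: the signature string is placed into a prompt to tell the LLM what JSON to produce, and a stand-in model reply is parsed back into a data instance with ``from_json``. The prompt wording and the ``llm_reply`` string are illustrative placeholders, not library output; only ``to_json_signature`` and ``from_json`` are the ``DataClass`` methods described above.

.. code-block:: python

    # Show the LLM the exact JSON structure we want it to produce.
    output_format_str = TrecData2.to_json_signature()
    prompt = (
        "Answer the user and respond with a JSON object in the following format:\n"
        f"{output_format_str}\n\n"
        "User question: What is the capital of France?"
    )
    print(prompt)

    # A stand-in for the raw text prediction returned by the model.
    llm_reply = (
        '{"question": {"question": "What is the capital of France?", "metadata": {}}, '
        '"label": 3, "metadata": {}}'
    )
    parsed = TrecData2.from_json(llm_reply)
    print(parsed)  # a TrecData2 instance reconstructed from the model reply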
-Data Instance or say Example -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -To better demonstrate either the data format or provide examples seen in few-shot In-context learning, -we provide two methods: `to_json` and `to_yaml` to convert the data instance to json or yaml string. +**Exclude** -First, let's create an instance of the `MyOutputs` and get the json and yaml string of the instance: +Now, if you decide to not show some fields in the output, you can use the `exclude` parameter in the methods. +Let's exclude both the ``metadata`` from class ``TrecData2`` and the ``metadata`` from class ``Question``: .. code-block:: python - instance = MyOutputs(name="Jane Doe", age=25) - print(instance.to_json()) - print(instance.to_yaml()) + json_signature_exclude = TrecData2.to_json_signature(exclude={"TrecData2": ["metadata"], "Question": ["metadata"]}) + print(json_signature_exclude) -The json output will be: +The output will be: -.. code-block:: json +.. code-block:: { - "name": "Jane Doe", - "age": 25 + "question": "The question asked by the user ({'type': 'Question', 'properties': {'question': {'type': 'str', 'desc': 'The question asked by the user'}}, 'required': ['question']}) (required)", + "label": "The label of the question (int) (optional)" } -You can use `json.loads` to convert the json string back to a dictionary. -The yaml output will be: +If you only want to exclude the ``metadata`` from class ``TrecData2``- the outer class, you can pass a list of strings simply: -.. code-block:: yaml +.. code-block:: python - name: "John Doe" - age: 25 + json_signature_exclude = TrecData2.to_json_signature(exclude=["metadata"]) + print(json_signature_exclude) -You can use `yaml.safe_load` to convert the yaml string back to a dictionary. +The output will be: +.. code-block:: + { + "question": "The question asked by the user ({'type': 'Question', 'properties': {'question': {'type': 'str', 'desc': 'The question asked by the user'}, 'metadata': {'type': 'dict', 'desc': 'The metadata of the question'}}, 'required': ['question']}) (required)", + "label": "The label of the question (int) (optional)" + } +The ``exclude`` parameter works the same across all methods. + +**DataClassFormatType** + +For data class format, we have :class:``core.base_data_class.DataClassFormatType`` along with ``format_class_str`` method to specify the format type for the data format methods. + +.. code-block:: python + + from lightrag.core import DataClassFormatType + + json_signature = TrecData2.format_class_str(DataClassFormatType.SIGNATURE_JSON) + print(json_signature) + + yaml_signature = TrecData2.format_class_str(DataClassFormatType.SIGNATURE_YAML) + print(yaml_signature) + + schema = TrecData2.format_class_str(DataClassFormatType.SCHEMA) + print(schema) + +.. Describe data to LLMs +.. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. Data Format +.. ^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. We need to describe either the input/output data format to give LLMs context on how to understand the input data and to generate the output data. + +.. What we want to let LLM know about our input/output data format: +.. In particular, it is important for LLMs to know these five things about the data format: + +.. 1. **Description** of what this field is for. We use `desc` key in the `metadata` of `field` to describe this field. Example: + +.. .. code-block:: python + +.. thought: str = field( +.. metadata={"desc": "The reasoning or thought behind the question."} +.. ) + +.. 2. **Required/Optional**. 
We use either `default` or `default_factory` to mark the field as optional except when our specialized function :func:`core.base_data_class.required_field` is used in `default_factory`, which marks the field as required. +.. 3. **Field Data Type** such as `str`, `int`, `float`, `bool`, `List`, `Dict`, etc. +.. 4. **Order of the fields** matter as in a typical Chain of Thought, we want the reasoning/thought field to be in the output ahead of the answer. +.. 5. The ablility to **exclude** some fields from the output. + +.. We provide two ways: (1) ``schema`` and (2) ``signature`` to describe the data format in particular. + +.. **Schema** + +.. ``schema`` will be a dict or json string and it is more verbose compared with ``signature``. +.. ``signature`` imitates the exact data format (`yaml` or `json`) that you want LLMs to generate. + +.. Here is a quick example on our ``schema`` for the ``MyOutputs`` data class using the `to_schema` method: + +.. .. code-block:: python + +.. MyOutputs.to_schema() + +.. The output will be a dict: + +.. .. code-block:: json + +.. { +.. "name": { +.. "type": "str", +.. "desc": "The name of the person", +.. "required": false +.. }, +.. "age": { +.. "type": "int", +.. "desc": "The age of the person", +.. "required": true +.. } +.. } + +.. You can use `to_schema_str` to have the json string output. + +.. In comparison with the schema used in other libraries: + +.. .. code-block:: json + +.. { +.. "properties": { +.. "name": { +.. "title": "Name", +.. "description": "The name of the user", +.. "default": "John Doe", +.. "type": "string", +.. }, +.. "age": { +.. "title": "Age", +.. "description": "The age of the user", +.. "type": "integer", +.. }, +.. }, +.. "required": ["age"], +.. } + +.. Even our ``schema`` is more token efficient as you can see. We opted out of the `default` field as it is more of a fallback value in the program +.. rather than a description of the data format to LLMs. -Load data from dataset as example -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -As we need to load or create an instance from a dataset, which is typically from Pytorch dataset or huggingface dataset and each data point is in -the form of a dictionary. -Let's create an instance of the `MyOutputs` from a dictionary: + +.. **Signature** + +.. ``signature`` is a string that imitates the exact data format (here we support `yaml` or `json`) that you want LLMs to generate. + +.. Let's use class methods ``to_json_signature`` and ``to_yaml_signature`` to generate the signature for the ``MyOutputs`` data class: + +.. .. code-block:: python + +.. print(MyOutputs.to_json_signature()) +.. print(MyOutputs.to_yaml_signature()) + +.. The json signature output will be: + +.. .. code-block:: json + +.. { +.. "name": "The name of the person (str) (optional)", +.. "age": "The age of the person (int) (required)" +.. } + +.. The yaml signature output will be: + +.. .. code-block:: yaml + +.. name: The name of the person (str) (optional) +.. age: The age of the person (int) (required) + +.. All of the above methods support `exclude` parameter to exclude some fields from the output. + +Show data examples & parse string to data instance +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Our functionality on data instance will help you show data examples to LLMs. +This is mainly done via ``to_dict`` method, which you can further convert to json or yaml string. +To convert the raw string back to the data instance, either from json or yaml string, we leverage class method ``from_dict``. 
+So it is important for ``DataClass`` to be able to ensure the reconstructed data instance is the same as the original data instance. +Here is how you can do it with a ``DataClass`` subclass: + +.. code-block:: python + + example = TrecData2(Question("What is the capital of France?"), 1, {"key": "value"}) + print(example) + + dict_example = example.to_dict() + print(dict_example) + + reconstructed = TrecData2.from_dict(dict_example) + print(reconstructed) + + print(reconstructed == example) + +The output will be: + +.. code-block:: python + + TrecData2(question=Question(question='What is the capital of France?', metadata={}), label=1, metadata={'key': 'value'}) + {'question': {'question': 'What is the capital of France?', 'metadata': {}}, 'label': 1, 'metadata': {'key': 'value'}} + TrecData2(question=Question(question='What is the capital of France?', metadata={}), label=1, metadata={'key': 'value'}) + True + +On top of ``from_dict`` and ``to_dict``, we make sure you can also directly work with: + +* ``from_yaml`` (from yaml string to reconstruct instance) and ``to_yaml`` (a yaml string) +* ``from_json`` (from json string to reconstruct instance) and ``to_json`` (a json string) + +Here is how it works with ``DataClass`` subclass: + +.. code-block:: python + + json_str = example.to_json() + print(json_str) + + yaml_str = example.to_yaml(example) + print(yaml_str) + + reconstructed_from_json = TrecData2.from_json(json_str) + print(reconstructed_from_json) + print(reconstructed_from_json == example) + + reconstructed_from_yaml = TrecData2.from_yaml(yaml_str) + print(reconstructed_from_yaml) + print(reconstructed_from_yaml == example) + +The output will be: + +.. code-block:: + + { + "question": { + "question": "What is the capital of France?", + "metadata": {} + }, + "label": 1, + "metadata": { + "key": "value" + } + } + question: + question: What is the capital of France? + metadata: {} + label: 1 + metadata: + key: value + + TrecData2(question=Question(question='What is the capital of France?', metadata={}), label=1, metadata={'key': 'value'}) + True + TrecData2(question=Question(question='What is the capital of France?', metadata={}), label=1, metadata={'key': 'value'}) + True + + +Similarly, (1) all ``to_dict``, ``to_json``, and ``to_yaml`` works with `exclude` parameter to exclude some fields from the output, +(2) you can use ``DataClassFormatType`` along with ``format_example_str`` method to specify the format type for the data example methods. .. code-block:: python - data = {"name": "Jane Doe", "age": 25} - print(MyOutputs.from_dict(data)) + from lightrag.core import DataClassFormatType + + example_str = example.format_example_str(DataClassFormatType.EXAMPLE_JSON) + print(example_str) + + example_str = example.format_example_str(DataClassFormatType.EXAMPLE_YAML) + print(example_str) + + +.. Let's create an instance of ``TrecData2`` and get the json and yaml string of the instance: + + + +.. To better demonstrate either the data format or provide examples seen in few-shot In-context learning, +.. we provide two methods: `to_json` and `to_yaml` to convert the data instance to json or yaml string. + +.. First, let's create an instance of the `MyOutputs` and get the json and yaml string of the instance: + +.. .. code-block:: python + +.. instance = MyOutputs(name="Jane Doe", age=25) +.. print(instance.to_json()) +.. print(instance.to_yaml()) + +.. The json output will be: + +.. .. code-block:: json + +.. { +.. "name": "Jane Doe", +.. "age": 25 +.. } +.. 
You can use `json.loads` to convert the json string back to a dictionary. + +.. The yaml output will be: + +.. .. code-block:: yaml + +.. name: "John Doe" +.. age: 25 - # Output - # MyOutputs(name='Jane Doe', age=25) +.. You can use `yaml.safe_load` to convert the yaml string back to a dictionary. -In most cases, your dataset's key and the field name might not directly match. -Instead of providing a mapping argument in the library, we suggest users to customize `from_dict` method for more **control** and **flexibility**. -Here is a real-world example: + + +Load data from dataset as example +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +As we need to load or create an instance from a dataset, which is typically from Pytorch dataset or huggingface dataset and each data point is in +the form of a dictionary. + +How you want to describe your data format to LLMs might not match to the existing dataset's key and the field name. +You can simply do a bit customization to map the dataset's key to the field name in your data class. .. code-block:: python + @dataclass class OutputFormat(DataClass): thought: str = field( metadata={ @@ -243,7 +580,7 @@ Here is a real-world example: class_index: int = field(metadata={"desc": "class_index in range[0, 5]"}) @classmethod - def from_dict(cls, data: Dict[str, Any]): + def from_dict(cls, data: Dict[str, object]): _COARSE_LABELS_DESC = [ "Abbreviation", "Entity", @@ -260,14 +597,30 @@ Here is a real-world example: return super().from_dict(data) .. note:: - + If you are looking for data types we used to support each component or any other class like `Optimizer`, you can check out the :ref:`core.types` file. +.. admonition:: References + :class: highlight + + 1. Dataclasses: https://docs.python.org/3/library/dataclasses.html + + + +.. admonition:: API References + :class: highlight + + - :class:`core.base_data_class.DataClass` + - :class:`core.base_data_class.DataClassFormatType` + - :func:`core.functional.custom_asdict` + - :ref:`core.base_data_class` + + .. Document .. ------------ .. We defined `Document` to function as a `string` container, and it can be used for any kind of text data along its `metadata` and relations .. such as `parent_doc_id` if you have ever splitted the documents into chunks, and `embedding` if you have ever computed the embeddings for the document. -.. It functions as the data input type for some `string`-based components, such as `DocumentSplitter`, `Retriever`. \ No newline at end of file +.. It functions as the data input type for some `string`-based components, such as `DocumentSplitter`, `Retriever`. diff --git a/docs/source/developer_notes/class_hierarchy.rst b/docs/source/developer_notes/class_hierarchy.rst new file mode 100644 index 00000000..074aad79 --- /dev/null +++ b/docs/source/developer_notes/class_hierarchy.rst @@ -0,0 +1,30 @@ +Class Hierarchy +============================= +From the plot of the `LightRAG` library's class hierarchy, we can see the library is well-centered around two base classes: `Component` and `DataClass`, and it has no more than two levels of subclasses. +This design philosophy results in a library with bare minimum abstraction, providing developers with maximum customizability. + +.. raw:: html + + + +
+ +
diff --git a/docs/source/developer_notes/component.rst b/docs/source/developer_notes/component.rst index f06706af..026d5a71 100644 --- a/docs/source/developer_notes/component.rst +++ b/docs/source/developer_notes/component.rst @@ -1,11 +1,17 @@ Component ============ + +.. .. admonition:: Author +.. :class: highlight + +.. `Li Yin `_ + What you will learn? 1. What is ``Component`` and why is it designed this way? 2. How to use ``Component`` along with helper classes like ``FunComponent`` and ``Sequential``? -Component +Design --------------------------------------- :ref:`Component` is to LLM task pipelines what ``nn.Module`` is to PyTorch models. @@ -59,6 +65,25 @@ Here is the comparison of writing a PyTorch model and a LightRAG task component. def call(self, query: str) -> str: return self.doc(prompt_kwargs={"input_str": query}).data + +As the foundamental building block in LLM task pipeline, the component is designed to serve five main purposes: + +1. **Standarize the interface for all components.** This includes the `__init__` method, the `call` method for synchronous call, the `acall` method for asynchronous call, and the `__call__` which in default calls the `call` method. +2. **Provide a unified way to visualize the structure of the task pipeline** via `__repr__` method. And subclass can additional add `_extra_repr` method to add more information than the default `__repr__` method. +3. **Tracks, adds all subcomponents and parameters automatically and recursively** to assistant the building and optimizing process of the task pipeline. +4. **Manages the states and serialization**, with `state_dict` and `load_state_dict` methods in particular for parameters and `to_dict` method for serialization of all the states fall into the component's attributes, from subcomponents to parameters, to any other attributes of various data type. +5. **Make all components configurable from using `json` or `yaml` files**. This is especially useful for experimenting or building data processing pipelines. + +These features are key to keep LightRAG pipeline transparent, flexible, and easy to use. +By subclassing from the `Component` class, you will get most of these features out of the box. + + +Component in Action +--------------------------------------- + +.. Transparency +.. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + In this note, we are creating an AI doctor to answer medical questions. Run the ``DocQA`` on a query: @@ -73,22 +98,8 @@ The response is: As a doctor, the best treatment for a headache would depend on the underlying cause of the headache. Typically, over-the-counter pain relievers such as acetaminophen, ibuprofen, or aspirin can help to alleviate the pain. However, if the headache is severe or persistent, it is important to see a doctor for further evaluation and to determine the most appropriate treatment option. Other treatment options may include prescription medications, lifestyle modifications, stress management techniques, and relaxation techniques. -As the foundamental building block in LLM task pipeline, the component is designed to serve four main purposes: - -1. **Standarize the interface for all components.** This includes the `__init__` method, the `call` method for synchronous call, the `acall` method for asynchronous call, and the `__call__` which in default calls the `call` method. -2. **Provide a unified way to visualize the structure of the task pipeline** via `__repr__` method. 
And subclass can additional add `_extra_repr` method to add more information than the default `__repr__` method. -3. **Tracks, adds all subcomponents and parameters automatically and recursively** to assistant the building and optimizing process of the task pipeline. -4. **Manages the states and serialization**, with `state_dict` and `load_state_dict` methods in particular for parameters and `to_dict` method for serialization of all the states fall into the component's attributes, from subcomponents to parameters, to any other attributes of various data type. - - -Here are the benefits of using the Component class: - -- Transparency. -- Flexibility. -- Searialization and deserialization. - -.. Transparency -.. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Print the structure +~~~~~~~~~~~~~~~~~~~~~ We can easily visualize the structure via `print`: @@ -103,15 +114,16 @@ The printout: DocQA( - (doc): Generator( - model_kwargs={'model': 'gpt-3.5-turbo'}, model_type=ModelType.LLM - (prompt): Prompt(template: You are a doctor User: {{input_str}}, prompt_variables: ['input_str']) - (model_client): OpenAIClient() - ) + (doc): Generator( + model_kwargs={'model': 'gpt-3.5-turbo'}, model_type=ModelType.LLM + (prompt): Prompt(template: You are a doctor User: {{input_str}}, prompt_variables: ['input_str']) + (model_client): OpenAIClient() + ) ) - +Configure from file +~~~~~~~~~~~~~~~~~~~~~ @@ -144,7 +156,8 @@ You can easily save the detailed states: To adds even more flexibility, we provide :class:`core.component.FunComponent` and :class:`core.component.Sequential` for more advanced use cases. -**Searalization and deserialization** +Searalization and deserialization +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ We provide ``is_pickable`` method to check if the component is pickable. And any of your component, it is a good practise to ensure it is pickable. @@ -239,13 +252,13 @@ The structure of the sequence using ``print(seq)``: Sequential( (0): EnhanceQueryComponent() (1): DocQA( - (doc): Generator( - model_kwargs={'model': 'gpt-3.5-turbo'}, model_type=ModelType.LLM - (prompt): Prompt(template: You are a doctor User: {{input_str}}, prompt_variables: ['input_str']) - (model_client): OpenAIClient() + (doc): Generator( + model_kwargs={'model': 'gpt-3.5-turbo'}, model_type=ModelType.LLM + (prompt): Prompt(template: You are a doctor User: {{input_str}}, prompt_variables: ['input_str']) + (model_client): OpenAIClient() + ) ) ) - ) .. admonition:: API reference :class: highlight diff --git a/docs/source/developer_notes/db.rst b/docs/source/developer_notes/db.rst index a1a754bd..f86f869c 100644 --- a/docs/source/developer_notes/db.rst +++ b/docs/source/developer_notes/db.rst @@ -1,7 +1,13 @@ Data & RAG ==================== - The purpose of this note is to provide an overview on data, data modeling, and data storage in LLM applications along with how LightRAG works with data. +.. admonition:: Author + :class: highlight + + `Li Yin `_ + + +The purpose of this note is to provide an overview on data, data modeling, and data storage in LLM applications along with how LightRAG works with data. We will conver: * Data models on how to represent important data. @@ -22,7 +28,7 @@ However, in real-world LLM applications, we can not avoid to deal with data stor 4. When it comes to applications where states matter, like games and chatbots, we need to store the states and conversational history. -.. figure:: /_static/database.png +.. 
figure:: /_static/images/database.png :align: center :alt: Data model and database :width: 620px diff --git a/docs/source/developer_notes/embedder.rst b/docs/source/developer_notes/embedder.rst index 4e71c74b..a9d040b1 100644 --- a/docs/source/developer_notes/embedder.rst +++ b/docs/source/developer_notes/embedder.rst @@ -1,5 +1,10 @@ Embedder ============ +.. admonition:: Author + :class: highlight + + `Li Yin `_ + What you will learn? 1. What is ``Embedder`` and why is it designed this way? @@ -62,10 +67,10 @@ We find the ``model_kwargs`` from the OpenAI API documentation. We setup `query` **Visualize structure**: we use ``print(embedder)``. The output will be: -.. code-block:: +.. code-block:: Embedder( - model_kwargs={'model': 'text-embedding-3-small', 'dimensions': 256, 'encoding_format': 'float'}, + model_kwargs={'model': 'text-embedding-3-small', 'dimensions': 256, 'encoding_format': 'float'}, (model_client): OpenAIClient() ) @@ -140,7 +145,7 @@ If we want to decreate the embedding dimension to only 256 to save memory, we ca new_embedding = normalize_vector(new_embedding) embedding.embedding = new_embedding return output - + def _extra_repr(self) -> str: repr_str = f"old_dim={self.old_dim}, new_dim={self.new_dim}, normalize={self.normalize}" return repr_str @@ -159,10 +164,10 @@ Putting it all together, we can create a new embedder with the output processor. The structure looks like: -.. code-block:: +.. code-block:: Embedder( - model_kwargs={'model': 'thenlper/gte-base'}, + model_kwargs={'model': 'thenlper/gte-base'}, (model_client): TransformersClient() (output_processors): DecreaseEmbeddingDim(old_dim=768, new_dim=256, normalize=True) ) @@ -188,7 +193,7 @@ The BatchEmbedder orchestrates the ``Embedder`` and handles the batching process .. code-block:: python from lightrag.core.embedder import BatchEmbedder - + batch_embedder = BatchEmbedder(embedder=local_embedder, batch_size=100) queries = [query] * 1000 @@ -216,4 +221,4 @@ The BatchEmbedder orchestrates the ``Embedder`` and handles the batching process - :class:`core.types.Embedding` - :class:`components.model_client.openai_client.OpenAIClient` - :class:`components.model_client.transformers_client.TransformersClient` - - :class:`core.functional.normalize_vector` \ No newline at end of file + - :class:`core.functional.normalize_vector` diff --git a/docs/source/developer_notes/evaluation.rst b/docs/source/developer_notes/evaluation.rst index a5f563ea..9b9315eb 100644 --- a/docs/source/developer_notes/evaluation.rst +++ b/docs/source/developer_notes/evaluation.rst @@ -1,7 +1,12 @@ -A Guideline on LLM Evaluation +LLM Evaluation ==================================== -As the saying goes, "You cannot improve what you cannot measure". This is especially true in the context of LLMs, which have become increasingly popular due to their impressive performance on a wide range of tasks. Evaluating LLMs and their applications is crucial in both research and production to understand their capabilities and limitations. +.. admonition:: Author + :class: highlight + + `Meng Liu `_ + +"You cannot improve what you cannot measure". This is especially true in the context of LLMs, which have become increasingly popular due to their impressive performance on a wide range of tasks. Evaluating LLMs and their applications is crucial in both research and production to understand their capabilities and limitations. Overall, such evaluation is a complex and multifaceted process. 
Below, we provide a guideline for evaluating LLMs and their applications, incorporating aspects outlined by *Chang et al.* [1]_: * **What to evaluate**: the tasks and capabilities that LLMs are evaluated on. diff --git a/docs/source/developer_notes/generator.rst b/docs/source/developer_notes/generator.rst index 00625a47..08fdf804 100644 --- a/docs/source/developer_notes/generator.rst +++ b/docs/source/developer_notes/generator.rst @@ -1,205 +1,451 @@ .. _generator: -Generator +Generator ========= -*The Center of it All* -Generator is the most essential functional component in LightRAG. -It is a user-facing orchestration component with a simple and unified interface for LLM prediction. -It orchestrates the following components along with their required arguments: +.. .. admonition:: Author +.. :class: highlight -- ``Prompt`` +.. `Li Yin `_ -- ``ModelClient`` +.. *The Center of it All* + +`Generator` is a user-facing orchestration component with a simple and unified interface for LLM prediction. +It is a pipeline that consists of three subcomponents. + +Design +--------------------------------------- + +.. figure:: /_static/images/generator.png + :align: center + :alt: LightRAG generator design + :width: 700px + + Generator - the orchestrator for LLM prediction + +:class:`Generator` is designed to achieve the following goals: + +- Model Agnostic: The Generator should be able to call any LLM model with the same prompt. +- Unified Interface: It should manage the pipeline of prompt (input) -> model call -> output parsing. +- Unified Output: This will make it easy to log and save records of all LLM predictions. +- Work with Optimizer: It should be able to work with the Optimizer to optimize the prompt. + +An orchestrator +^^^^^^^^^^^^^^^^^ + +It orchestrates three components: + +- ``Prompt``: by taking in ``template`` (string) and ``prompt_kwargs`` (dict) to format the prompt at initialization. When the ``template`` is not given, it defaults to :const:`DEFAULT_LIGHTRAG_SYSTEM_PROMPT`. + +- ``ModelClient``: by taking in an already instantiated ``model_client`` and ``model_kwargs`` to call the model. Switching out the model client allows you to call any LLM model with the same prompt and output parsing. + +- ``output_processors``: a component, or components chained via ``Sequential``, to process the raw response into the desired format. If no output processor is provided, the output is whatever the model client returns, often the raw string response (from the first response message). + +**Call and arguments** + +Generator supports both the ``call`` (``__call__``) and ``acall`` methods. +They take two optional arguments: + +- ``prompt_kwargs`` (dict): to be passed to its ``Prompt`` component. +- ``model_kwargs`` (dict): will be combined with the ``model_kwargs`` passed at initialization. + +The generator will call ``Prompt`` to format the final prompt and adapt the inputs into ones workable with ``ModelClient``. +In particular, it passes: + +- The formatted prompt after calling ``Prompt``. +- All combined ``model_kwargs`` and :const:`ModelType.LLM` to the ``ModelClient``. + +.. note :: + + This also means any ``ModelClient`` that wants to be compatible with `Generator` should take in ``model_kwargs`` and ``model_type`` as arguments. -- Output processors to process the raw string response to desired format. -By switching out the model client, you can call any LLM model on your prompt, either API or local. 
GeneratorOutput -^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^ Different from all other components, we can not alway enforce LLM to output the right format. -We in particular created a :class:`core.types.GeneratorOutput` (a subclass of ``DataClass``) to store `data` (parsed response), `error` (error message if either the model inference SDKs fail or the output parsing fail) and `raw_response` (raw string response for reference) for any LLM predictions. -It is in developers' hands to process the output accordingly. +Some part of the `Generator` pipeline can fail. + +.. note:: + Whenever there is an error happens, we do not raise the error and stop this pipeline. + Instead, `Generator` will always return an output record. + We made this design choice as it can be really helpful to log various failed cases on your test examples without stopping the pipeline for further investigation and improvement. + +In particular, we created :class:`GeneratorOutput` (subclass of ``DataClass``) to capture important information. + +- `data` (object) : to store the final processed response after all three components in the pipeline. This means `success`. +- `error` (str): error message if any of the three components in the pipeline fail. When this is not `None`, it means `failure`. +- `raw_response` (str): raw string response for reference for any LLM predictions. For now it is a string, which comes from the first response message. [This might change and be different in the future] +- `metadata` (dict): to store any additional information and `usage` reserved to track the usage of the LLM prediction. + +Whether to do further processing or terminate the pipeline whenever an error happens is up to the user from here on. + +Generator In Action +--------------------------------------- + +We will create a simple one-turn chatbot to demonstrate how to use the Generator in action. -GeneratorInAction +Minimum Example ^^^^^^^^^^^^^^^^^ -Beside of these examples, LLM is like water, even in our library, we have components that have adpated Generator to other various functionalities. -- :class:`components.retriever.llm_retriever.LLMRetriever` is a retriever that uses Generator to call LLM to retrieve the most relevant documents. -- :class:`eval.llm_as_judge.DefaultLLMJudge` is a judge that uses Generator to call LLM to evaluate the quality of the response. -- :class:`optim.llm_optimizer.LLMOptimizer` is an optimizer that uses Generator to call LLM to optimize the prompt. +The minimum setup to initiate a generator in the code: -Tracing -^^^^^^^^^^^ -In particular, we provide two tracing methods to help you develop and improve the Generator: +.. code-block:: python + + from lightrag.core import Generator + from lightrag.components.model_client import GroqAPIClient + + generator = Generator( + model_client=GroqAPIClient(), + model_kwargs={"model": "llama3-8b-8192"}, + ) + print(generator) + +The structure of generator using ``print``: + +.. raw:: html + +
+
+            
+        Generator(
+        model_kwargs={'model': 'llama3-8b-8192'},
+        (prompt): Prompt(
+            template: {% if task_desc_str or output_format_str or tools_str or examples_str or chat_history_str or context_str or steps_str %}
+            
+            {% endif %}
+            {# task desc #}
+            {% if task_desc_str %}
+            {{task_desc_str}}
+            {% endif %}
+            {# output format #}
+            {% if output_format_str %}
+            
+            {{output_format_str}}
+            
+            {% endif %}
+            {# tools #}
+            {% if tools_str %}
+            
+            {{tools_str}}
+            
+            {% endif %}
+            {# example #}
+            {% if examples_str %}
+            
+            {{examples_str}}
+            
+            {% endif %}
+            {# chat history #}
+            {% if chat_history_str %}
+            
+            {{chat_history_str}}
+            
+            {% endif %}
+            {#contex#}
+            {% if context_str %}
+            
+            {{context_str}}
+            
+            {% endif %}
+            {# steps #}
+            {% if steps_str %}
+            
+            {{steps_str}}
+            
+            {% endif %}
+            {% if task_desc_str or output_format_str or tools_str or examples_str or chat_history_str or context_str or steps_str %}
+            
+            {% endif %}
+            {% if input_str %}
+            
+            {{input_str}}
+            
+            {% endif %}
+            You:
+            , prompt_variables: ['output_format_str', 'chat_history_str', 'task_desc_str', 'context_str', 'steps_str', 'input_str', 'tools_str', 'examples_str']
+        )
+        (model_client): GroqAPIClient()
+    )
+            
+        
+
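+
+Since ``Generator`` also supports the asynchronous ``acall`` method described in the design section above, here is a minimal sketch of awaiting it. This is an illustration only: it assumes the same ``generator`` instance from the minimum example, and the query string is purely illustrative.
+
+.. code-block:: python
+
+    import asyncio
+
+    async def main():
+        # acall mirrors call: it takes the same optional prompt_kwargs and model_kwargs
+        output = await generator.acall(
+            prompt_kwargs={"input_str": "What is LLM? Explain in one sentence."}
+        )
+        print(output.data)
+
+    asyncio.run(main())
+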
+ +**Show the final prompt** + +`Generator` 's ``print_prompt`` method will simply relay the method from the `Prompt` component: + +.. code-block:: python + + prompt_kwargs = {"input_str": "What is LLM? Explain in one sentence."} + generator.print_prompt(**prompt_kwargs) -1. Trace the history change(states) on prompt during your development process. Developers typically go through a long process of prompt optimization and it is frustrating -to lose track of the prompt changes when your current change actually makes the performance much worse. +The output will be the formatted prompt: -We created a `GeneratorStateLogger` to handle the logging and saving into json files. To further simplify developers's process, -we provides a class decorator `trace_generator_states` where a single line of code can be added to any of your task component. -It will automatically track any attributes of type `Generator`. +.. code-block:: + + + What is LLM? Explain in one sentence. + + You: + + + +**Call the generator** .. code-block:: python - from tracing import trace_generator_states - from core import Component, Generator + output = generator( + prompt_kwargs=prompt_kwargs, + ) + print(output) - @trace_generator_states() - class SimpleQA(Component): - def __init__(self): - super().__init__() - self.generator = Generator(...) - self.generator_2 = Generator(...) - def call(...): +The output will be the `GeneratorOutput` object: -In default, a dir from the current working directory will be created to store the log files. -The project name in defaul is `SimpleQA` and the log file will be named as `generator_state_trace.json` -where both the `generator` and `generator_2` will be logged. -The structure of log directory is as follows: +.. code-block:: -.. code-block:: bash + GeneratorOutput(data='LLM stands for Large Language Model, a type of artificial intelligence that is trained on vast amounts of text data to generate human-like language outputs, such as conversations, text, or summaries.', error=None, usage=None, raw_response='LLM stands for Large Language Model, a type of artificial intelligence that is trained on vast amounts of text data to generate human-like language outputs, such as conversations, text, or summaries.', metadata=None) - . - ├── traces - │ ├── SimpleQA - │ │ ├── generator_state_trace.json +Use template +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In this example, we will use a customized template to format the prompt. +We intialized the prompt with one variable `task_desc_str` and it is further combined with the `input_str` in the prompt. + +.. code-block:: python + template = r"""{{task_desc_str}} + User: {{input_str}} + You:""" + generator = Generator( + model_client=GroqAPIClient(), + model_kwargs={"model": "llama3-8b-8192"}, + template=template, + prompt_kwargs={"task_desc_str": "You are a helpful assistant"}, + ) + prompt_kwargs = {"input_str": "What is LLM?"} -Here is an example log file: + generator.print_prompt( + **prompt_kwargs, + ) + output = generator( + prompt_kwargs=prompt_kwargs, + ) -.. code-block:: json +The final prompt is: +.. code-block:: + + You are a helpful assistant + User: What is LLM? + You: + +.. note:: + + It is quite straightforward to use any prompt. + They only need to stick to ``jinja2`` syntax. + + +Use output_processors +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In this example, we will instruct LLM to output a JSON object to respond. +We will use the `JsonParser` to parse the output back to a `dict` object. + +.. 
code-block:: python + + from lightrag.core import Generator + from lightrag.core.types import GeneratorOutput + from lightrag.components.model_client import OpenAIClient + from lightrag.core.string_parser import JsonParser + + output_format_str = r"""Your output should be formatted as a standard JSON object with two keys: { - "generator": [ - { - "prompt_states": { - "_components": {}, - "_parameters": {}, - "training": false, - "_template_string": "{# task desc #}\n{% if task_desc_str %}\n{{task_desc_str}}\n{% else %}\nAnswer user query.\n{% endif %}\n{# output format #}\n{% if output_format_str %}\n\n{{output_format_str}}\n\n{% endif %}\n{# tools #}\n{% if tools_str %}\n\n{{tools_str}}\n\n{% endif %}\n{# example #}\n{% if examples_str %}\n\n{{examples_str}}\n\n{% endif %}\n{# chat history #}\n{% if chat_history_str %}\n\n{{chat_history_str}}\n\n{% endif %}\n{#contex#}\n{% if context_str %}\n\n{{context_str}}\n\n{% endif %}\n{# steps #}\n{% if steps_str %}\n\n{{steps_str}}\n\n{% endif %}\n{% if input_str %}\n\n{{input_str}}\n\n{% endif %}\n{% if output_str %}\n\n{{output_str}}\n\n{% endif %}\n", - "prompt_variables": [ - "chat_history_str", - "context_str", - "examples_str", - "input_str", - "output_format_str", - "output_str", - "steps_str", - "task_desc_str", - "tools_str" - ], - "preset_prompt_kwargs": { - "task_desc_str": "You are a helpful assistant and with a great sense of humor." - } + "explaination": "A brief explaination of the concept in one sentence.", + "example": "An example of the concept in a sentence." + } + """ + + generator = Generator( + model_client=OpenAIClient(), + model_kwargs={"model": "gpt-3.5-turbo"}, + prompt_kwargs={"output_format_str": output_format_str}, + output_processors=JsonParser(), + ) + + prompt_kwargs = {"input_str": "What is LLM?"} + generator.print_prompt(**prompt_kwargs) + + output: GeneratorOutput = generator(prompt_kwargs=prompt_kwargs) + print(type(output.data)) + print(output.data) + +The final prompt is: + +.. code-block:: + + + + + Your output should be formatted as a standard JSON object with two keys: + { + "explaination": "A brief explaination of the concept in one sentence.", + "example": "An example of the concept in a sentence." + } + + + + + What is LLM? + + You: + +The output of the call is: + +.. code-block:: + + + {'explaination': 'LLM stands for Large Language Model, which are deep learning models trained on enormous amounts of text data.', 'example': 'An example of a LLM is GPT-3, which can generate human-like text based on the input provided.'} + +Switch model client +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Also, did you notice that we have already switched to use models from `OpenAI` in the above example? +This is how easy to switch the model client in the Generator, making it a truly model-agnostic component. +We can even use :class:`ModelClientType` to switch the model client without handling multiple imports. + +.. code-block:: python + + from lightrag.core.types import ModelClientType + + generator = Generator( + model_client=ModelClientType.OPENAI(), # or ModelClientType.GROQ() + model_kwargs={"model": "gpt-3.5-turbo"}, + ) + +Get errors in the output +^^^^^^^^^^^^^^^^^^^^^^^^^ + +We will use a wrong API key to delibrately create an error. +We will still get a response, but only with empty ``data`` and an error message. +Here is the api key error with OpenAI: + +.. code-block:: python + + GeneratorOutput(data=None, error="Error code: 401 - {'error': {'message': 'Incorrect API key provided: ab. 
You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}", usage=None, raw_response=None, metadata=None) + + +Create from configs +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Same as all components, we can create the generator purely from configs. + +**Know it is a Generator** + +In this case, we know we are creating a generator, we will use ``from_config`` method from the ``Generator`` class. + +.. code-block:: python + + from lightrag.core import Generator + + config = { + "model_client": { + "component_name": "GroqAPIClient", + "component_config": {}, + }, + "model_kwargs": { + "model": "llama3-8b-8192", + }, + } + + generator: Generator = Generator.from_config(config) + print(generator) + + prompt_kwargs = {"input_str": "What is LLM? Explain in one sentence."} + generator.print_prompt(**prompt_kwargs) + output = generator( + prompt_kwargs=prompt_kwargs, + ) + print(output) + + +**Purely from the configs** + +This is even more general. +This method fits to create any component from configs. +We just need to follow the config structure: ``component_name`` and ``component_config`` for all arguments. + +.. code-block:: python + + from lightrag.utils.config import new_component + from lightrag.core import Generator + + config = { + "generator": { + "component_name": "Generator", + "component_config": { + "model_client": { + "component_name": "GroqAPIClient", + "component_config": {}, }, - "time_stamp": "2024-06-02T15:55:21.765794" - }, - { - "prompt_states": { - "_components": {}, - "_parameters": {}, - "training": false, - "_template_string": "{# task desc #}\n{% if task_desc_str %}\n{{task_desc_str}}\n{% else %}\nAnswer user query.\n{% endif %}\n{# output format #}\n{% if output_format_str %}\n\n{{output_format_str}}\n\n{% endif %}\n{# tools #}\n{% if tools_str %}\n\n{{tools_str}}\n\n{% endif %}\n{# example #}\n{% if examples_str %}\n\n{{examples_str}}\n\n{% endif %}\n{# chat history #}\n{% if chat_history_str %}\n\n{{chat_history_str}}\n\n{% endif %}\n{#contex#}\n{% if context_str %}\n\n{{context_str}}\n\n{% endif %}\n{# steps #}\n{% if steps_str %}\n\n{{steps_str}}\n\n{% endif %}\n{% if input_str %}\n\n{{input_str}}\n\n{% endif %}\n{% if output_str %}\n\n{{output_str}}\n\n{% endif %}\n", - "prompt_variables": [ - "chat_history_str", - "context_str", - "examples_str", - "input_str", - "output_format_str", - "output_str", - "steps_str", - "task_desc_str", - "tools_str" - ], - "preset_prompt_kwargs": { - "task_desc_str": "You are a helpful assistant and with a great sense of humor. Second edition." 
- } + "model_kwargs": { + "model": "llama3-8b-8192", }, - "time_stamp": "2024-06-02T15:56:37.756148" - } - ], - "generator2": [ - { - "prompt_states": { - "_components": {}, - "_parameters": {}, - "training": false, - "_template_string": "{# task desc #}\n{% if task_desc_str %}\n{{task_desc_str}}\n{% else %}\nAnswer user query.\n{% endif %}\n{# output format #}\n{% if output_format_str %}\n\n{{output_format_str}}\n\n{% endif %}\n{# tools #}\n{% if tools_str %}\n\n{{tools_str}}\n\n{% endif %}\n{# example #}\n{% if examples_str %}\n\n{{examples_str}}\n\n{% endif %}\n{# chat history #}\n{% if chat_history_str %}\n\n{{chat_history_str}}\n\n{% endif %}\n{#contex#}\n{% if context_str %}\n\n{{context_str}}\n\n{% endif %}\n{# steps #}\n{% if steps_str %}\n\n{{steps_str}}\n\n{% endif %}\n{% if input_str %}\n\n{{input_str}}\n\n{% endif %}\n{% if output_str %}\n\n{{output_str}}\n\n{% endif %}\n", - "prompt_variables": [ - "chat_history_str", - "context_str", - "examples_str", - "input_str", - "output_format_str", - "output_str", - "steps_str", - "task_desc_str", - "tools_str" - ], - "preset_prompt_kwargs": { - "task_desc_str": "You are the second generator." - } }, - "time_stamp": "2024-06-03T16:44:45.223220" } - ] } - -2. Trace all failed LLM predictions for further improvement. -Similarly, :class:`tracing.generator_call_logger.GeneratorCallLogger` is created to log generator call input arguments and output results. -`trace_generator_call` decorator is provided to provide one-line setup to trace calls, which in default will log only failed predictions. + generator: Generator = new_component(config["generator"]) + print(generator) -Adding the second decorator to the above example: + prompt_kwargs = {"input_str": "What is LLM? Explain in one sentence."} + generator.print_prompt(**prompt_kwargs) + output = generator( + prompt_kwargs=prompt_kwargs, + ) + print(output) -.. code-block:: python +It works exactly the same as the previous example. +We imported ``Generator`` in this case to only show the type hinting. - from tracing import trace_generator_errors +.. note:: - @trace_generator_call() - @trace_generator_states() - class SimpleQA(Component): - def __init__(self): - super().__init__() - self.generator = Generator(...) - self.generator_2 = Generator(...) - def call(...): + Please refer the :doc:`configurations` for more details on how to create components from configs. -Now, three more files will be created in the log directory: -.. code-block:: bash +Examples across the library +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - . - ├── traces - │ ├── SimpleQA - │ │ ├── logger_metadata.json - │ │ ├── generator_call.jsonl - │ │ ├── generator_2_call.jsonl +Beside of these examples, LLM is like water, even in our library, we have components that have adpated Generator to other various functionalities. -The `logger_metadata.json` file contains the metadata of the logger, it looks like this: +- :class:`LLMRetriever` is a retriever that uses Generator to call LLM to retrieve the most relevant documents. +- :class:`DefaultLLMJudge` is a judge that uses Generator to call LLM to evaluate the quality of the response. +- :class:`LLMOptimizer` is an optimizer that uses Generator to call LLM to optimize the prompt. -.. code-block:: json +Tracing +--------------------------------------- +In particular, we provide two tracing methods to help you develop and improve the ``Generator``: - { - "generator": "./traces/SimpleQA/generator_call.jsonl", - "generator2": "./traces/SimpleQA/generator2_call.jsonl" - } +1. 
Trace the history change (states) on prompt during your development process. + +Developers typically go through a long process of prompt optimization, and it is frustrating to lose track of the prompt changes when your current change actually makes the performance much worse. +We created a :class:`GeneratorStateLogger` to handle the logging and saving into JSON files. To further simplify the developer's process, we provide a class decorator `trace_generator_states` where a single line of code can be added to any of your task components. It will automatically track any attributes of type `Generator`. -The `generator_call.jsonl` file contains the log of all calls to the generator, it looks like this: +2. Trace all failed LLM predictions for further improvement. -.. code-block:: json +Similarly, :class:`GeneratorCallLogger` is created to log generator call input arguments and output results. +The `trace_generator_call` decorator is provided to offer a one-line setup to trace calls, which by default will log only failed predictions. - {"prompt_kwargs": {"input_str": "What is the capital of France?"}, "model_kwargs": {}, "output": {"data": "Bonjour!\n\nThe capital of France is Paris, of course! But did you know that the Eiffel Tower in Paris is actually the most-visited paid monument in the world? Mind-blowing, right?\n\nNow, would you like to know some more fun facts or perhaps ask another question? I'm all ears (or should I say, all eyes?)", "error_message": null, "raw_response": "Bonjour!\n\nThe capital of France is Paris, of course! But did you know that the Eiffel Tower in Paris is actually the most-visited paid monument in the world? Mind-blowing, right?\n\nNow, would you like to know some more fun facts or perhaps ask another question? I'm all ears (or should I say, all eyes?)"}, "time_stamp": "2024-06-03T16:44:45.582859"} +.. note:: + + This note is getting rather long. Please go to the :doc:`tracing` for more details on how to use these tracing methods. -.. note :: - Usually, let the evaluation run on evaluation to collect as much as failed predictions can be highly helpful for either manual prompting or auto-prompt engineering (APE). Training [Experimental] -^^^^^^^^^^^^^^^^^^^^^^^ +--------------------------------------- +Coming soon! .. A Note on Tokenization# .. By default, LlamaIndex uses a global tokenizer for all token counting. This defaults to cl100k from tiktoken, which is the tokenizer to match the default LLM gpt-3.5-turbo. @@ -211,5 +457,14 @@ Training [Experimental] - :class:`core.generator.Generator` - :class:`core.types.GeneratorOutput` + - :class:`core.default_prompt_template.DEFAULT_LIGHTRAG_SYSTEM_PROMPT` + - :class:`core.types.ModelClientType` + - :class:`core.types.ModelType` + - :class:`core.string_parser.JsonParser` + - :class:`core.prompt_builder.Prompt` - :class:`tracing.generator_call_logger.GeneratorCallLogger` - :class:`tracing.generator_state_logger.GeneratorStateLogger` + - :class:`components.retriever.llm_retriever.LLMRetriever` + - :class:`eval.llm_as_judge.DefaultLLMJudge` + - :class:`optim.llm_optimizer.LLMOptimizer` + - :func:`utils.config.new_component` diff --git a/docs/source/developer_notes/index.rst b/docs/source/developer_notes/index.rst index 69c8dcac..463b03a5 100644 --- a/docs/source/developer_notes/index.rst +++ b/docs/source/developer_notes/index.rst @@ -1,17 +1,20 @@ .. _developer_notes: -Developer Notes +Tutorials ============================= -*Why and How each part works* +.. 
*Why and How Each Part works* -Learn LightRAG design phisolophy and the `why` and `how-to` (customize and integrate) behind each core part within the LightRAG library. -This is our tutorials before you move ahead to build use cases (LLM applications) end to end. +Learn the `why` and `how-to` (customize and integrate) behind each core part within the `LightRAG` library. +These are our most important tutorials before you move ahead to build use cases (LLM applications) end to end. -.. note:: - You can read interchangably between :ref:`Use Cases `. +.. raw:: + + .. note:: + + You can read interchangably between :ref:`Use Cases `. @@ -20,32 +23,32 @@ This is our tutorials before you move ahead to build use cases (LLM application :align: center :width: 600px - LLM application is no different from a mode training/eval workflow + LLM application is no different from a mode training/evaluation workflow .. :height: 100px .. :width: 200px -LightRAG library focus on providing building blocks for developers to **build** and **optimize** the `task pipeline`. -We have clear design phisolophy: - +The `LightRAG` library focuses on providing building blocks for developers to **build** and **optimize** the task pipeline. +We have a clear :doc:`lightrag_design_philosophy`, which results in this :doc:`class_hierarchy`. .. toctree:: :maxdepth: 1 + :caption: Introduction + :hidden: lightrag_design_philosophy - - llm_intro + class_hierarchy Building -============================= - +------------------- Base classes ---------------- -Code path: ``lightrag.core``. +~~~~~~~~~~~~~~~~~~~~~~ +Code path: :ref:`lightrag.core `. + .. list-table:: :widths: 20 80 @@ -54,27 +57,33 @@ Code path: ``lightrag.core``. * - Base Class - Description * - :doc:`component` - - Similar to ``Module`` in `PyTorch`, it standardizes the interface of all components with `call`, `acall`, and `__call__` methods, handles states, and serialization. Components can be easily chained togehter via `Sequential` for now. + - The building block for task pipeline. It standardizes the interface of all components with `call`, `acall`, and `__call__` methods, handles state serialization, nested components, and parameters for optimization. Components can be easily chained together via ``Sequential``. * - :doc:`base_data_class` - - Leverages the ``dataclasses`` module in Python to ease the data interaction with prompt and serialization. + - The base class for data. It eases the data interaction with LLMs for both prompt formatting and output parsing. .. create side bar navigation + .. toctree:: :maxdepth: 1 + :caption: Base Classes :hidden: component base_data_class RAG Essentials -------------------- -Code path: ``lightrag.core``. For abstract classes: +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +RAG components +^^^^^^^^^^^^^^^^^^^ -- ``ModelClient``: the functional subclass is in ``lightrag.components.model_client``. -- ``Retriever``: the functional subclass is in ``lightrag.components.retriever``. It works hand-in-hand with the ``LocalDB`` and Cloud DB in ``lightrag.database``. + +Code path: :ref:`lightrag.core`. For abstract classes: + +- ``ModelClient``: the functional subclass is in :ref:`lightrag.components.model_client`. +- ``Retriever``: the functional subclass is in :ref:`lightrag.components.retriever`. .. list-table:: @@ -84,17 +93,20 @@ Code path: ``lightrag.core``. For abstract classes: * - Part - Description * - :doc:`prompt` - - Built on ``jinja2``, it programmablly and flexibly format prompt(text) as **input to the generator**. 
+ - Built on `jinja2`, it programmatically and flexibly formats prompts as input to the generator. * - :doc:`model_client` - ``ModelClient`` is the protocol and base class for LightRAG to **integrate all models**, either APIs or local, LLMs or Embedding models or any others. * - :doc:`generator` - The **center component** that orchestrates the model client(LLMs in particular), prompt, and output processors for format parsing or any post processing. + * - :doc:`output_parsers` + - The component that parses the output string to structured data. * - :doc:`embedder` - The component that orchestrates model client (Embedding models in particular) and output processors. * - :doc:`retriever` - The base class for all retrievers who in particular retrieve relevant documents from a given database to add **context** to the generator. - +Data Pipeline and Storage +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Data Processing: including transformer, pipeline, and storage. Code path: ``lightrag.components.data_process``, ``lightrag.core.db``, and ``lightrag.database``. Components work on a sequence of ``Document`` and return a sequence of ``Document``. @@ -108,7 +120,7 @@ Components work on a sequence of ``Document`` and return a sequence of ``Documen * - :doc:`text_splitter` - To split long text into smaller chunks to fit into the token limits of embedder and generator or to ensure more relevant context while being used in RAG. * - :doc:`db` - - Understanding the data modeling, processing, and storage as a whole. We will build a chatbot with enhanced memory and memoy retrieval in this note (RAG). + - Understanding the **data modeling, processing, and storage** as a whole. We will build a chatbot with enhanced memory and memoy retrieval in this note (RAG). .. * - :doc:`data_pipeline` @@ -118,29 +130,42 @@ Components work on a sequence of ``Document`` and return a sequence of ``Documen .. toctree:: :maxdepth: 1 + :caption: RAG Essentials :hidden: prompt model_client generator + output_parsers embedder retriever text_splitter db - rag + + Agent Essentials ------------------------------ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Agent in ``components.agent`` is LLM great with reasoning, planning, and using tools to interact and accomplish tasks. .. list-table:: - :widths: 20 80 + :widths: 30 70 :header-rows: 1 - * - :doc: `tool_helper` - - Provide tools (function calls) to interact with the generator. - * - :doc: `agent` - - The ReactAgent. + * - Part + - Description + * - :doc:`tool_helper` + - Provide tools (function calls) to interact with the generator. + * - :doc:`agent` + - The ReactAgent. + +.. toctree:: + :maxdepth: 1 + :caption: Agent Essentials + :hidden: + + tool_helper + agent .. Core functionals .. ------------------- @@ -167,14 +192,14 @@ Agent in ``components.agent`` is LLM great with reasoning, planning, and using t Optimizing -============================= +------------------- Datasets and Evaulation .. toctree:: :maxdepth: 1 + :caption: Datasets and Evaulation - configs datasets @@ -185,6 +210,7 @@ Optimizer & Trainer .. toctree:: :maxdepth: 1 + :caption: Optimizer & Trainer parameter @@ -192,8 +218,8 @@ Optimizer & Trainer trainer -Logging & Tracing -============================= +Logging & Tracing & Configurations +------------------------------------ Code path: ``lightrag.utils``. .. list-table:: @@ -207,7 +233,10 @@ Code path: ``lightrag.utils``. .. 
toctree:: :maxdepth: 1 + :caption: Logging & Tracing & Configurations :hidden: + logging logging_tracing + configs diff --git a/docs/source/developer_notes/lightrag_design_philosophy.rst b/docs/source/developer_notes/lightrag_design_philosophy.rst index bad3a501..462fd228 100644 --- a/docs/source/developer_notes/lightrag_design_philosophy.rst +++ b/docs/source/developer_notes/lightrag_design_philosophy.rst @@ -1,52 +1,34 @@ -LightRAG Design Philosophy +Design Philosophy ==================================== -.. Deep understanding of the LLM workflow -.. --------------------------------------- +Right from the begining, `LightRAG` follows three fundamental principles. -LLMs are like `water`, it is all up to users to shape it into different use cases. In `PyTorch`, most likely users do not need to build their -own ``conv`` or ``linear`` module, or their own ``Adam`` optimizer. Their building blocks can meet > 90% of their user's needs on `building` and -`optimizing` (training) their models, leaving less than 10% of users, mostly contributors and researchers to build their own ``Module``, ``Tensor``, -``Optimizer``, etc. Libraries like `PyTorch`, `numpy`, `scipy`, `sklearn`, `pandas`, etc. are all doing the heavy lifting on the computation optimization. -However, for developers to write their own LLM task pipeline, calling apis or using local LLMs to shape the LLMs via prompt into any use case is not a hard feat. -The hard part is on `evaluating` and `optimizing` their task pipeline. -Optimizing over Building +Principle 1: Quality over Quantity ----------------------------------------------------------------------- - We help users to build the task pipeline, but we want to help on optimizing even more so. + The Quality of core building blocks over the Quantity of integrations. -In fact, building the task pipeline accounts for only **10%** of users' development process, the other **90%** is on optimtizing and iterating. -The most popular libraries like ``Langchain`` and ``LlamaIndex`` are mainly focusing on `building` the task pipeline, prioritizing integrations and coveraging on different type of tasks, resulting large amounts of classes, each -with many layers of class inheritance. With the existing libraries, users get stuck on just following the examples, and it requires more time for them to figure out customization than writing their -own code. +We aim to provide developers with well-designed core building blocks that are **easy** to understand, **transparent** to debug, and **flexible** enough to customize. +This goes for the prompt, the model client, the retriever, the optimizer, and the trainer. -How to `build` the task pipeline has starting to mature: `prompt`, `retriever`, `generator`, `RAG`, `Agent` has becoming well-known concepts. -How to `optimize` the task pipeline is still a mystery to most users. And most are still doing `manual` prompt engineering without good -`observability` (or `debugging` ) tools. And these existing `observability` tools are mostly commercialized, prioritizing the `fancy` looks without -real deep understanding of the LLM workflow. -The existing optimization process of LLM applications are full of frustrations. -Quality over Quantity +Principle 2: Optimizing over Building ----------------------------------------------------------------------- - The Quality of core building blocks over the Quantity of integrations. + We help users build the task pipeline, but we want to help with optimizing even more so. 
+ + -The whole `PyTorch` library is built on a few core and base classes: ``Module``, ``Tensor``, ``Parameter``, and ``Optimizer``, -and various ``nn`` modules for users to build a model, along with ``functionals``. -This maps to ``Component``, ``DataClass``, ``Parameter``, and ``Optimizer`` in LightRAG, and various subcomponents -like ``Generator``, ``Retriever``, ``Prompt``, ``Embedder``, ``ModelClient``, along with ``functionals`` to process string, -interprect tool from the string. +We will design our building blocks with `optimization` in mind. +This means beyond giving developers transparency and control, providing them with great `logging`, `observability`, `configurability`, `optimizers`, and `trainers` +to ease the existing frustrations of optimizing the task pipeline. -We recognize developers who are building real-world Large Language Model (LLM) applications are the real heroes, doing the hard -work. They need well-designed core building blocks: **easy** to understand, **transparent** to debug, **flexible** enough to customize their own -``ModelClient``, their own ``Prompt``, their own ``Generator`` and even their own ``Optimizer``, ``Trainer``. The need to build their own component is even more so than using `PyTorch.` -LightRAG aggressively focus on the quality and clarity of the core building blocks over the quantity of integrations. -Practicality over Showmanship +Principle 3: Practicality over Showmanship ----------------------------------------------------------------------- We put these three hard rules while designing LightRAG: @@ -56,7 +38,75 @@ We put these three hard rules while designing LightRAG: +Our deep understanding of LLM workflow +----------------------------------------------------------------------- + +The above principles are distilled from our deep understanding of the LLM workflow. + + +**Developers are the ultimate heroes** + +LLMs are like `water`, they can almost do anything, from GenAI applications such as `chatbot`, `translation`, `summarization`, `code generation`, `autonomous agent` to classical NLP tasks like `text classification`, and `named entity recognition`. +They interact with the world beyond the model's internal knowledge via `retriever`, `memory`, and `tools` (`function calls`). +Each use case is unique in its data, its business logic, and its unique user experience. + + +Building LLM applications is a combination of software engineering and modeling (in-context learning). +Libraries like `PyTorch` mainly provide basic building blocks and do the heavy lifting on computation optimization. +If 10% of all `PyTorch` users need to customize a layer or an optimizer, the chance of customizing will only be higher for LLM applications. +Any library aiming to provide out-of-box solutions is destined to fail as it is up to the developers to address each unique challenge. + + + +**Manual prompt engineering vs Auto-prompt optimization** + +Developers rely on prompting to shape the LLMs into their use cases via In-context learning (ICL). +However, LLM prompting is highly sensitive: the accuracy gap between top-performing and lower-performing prompts can be as high as 40%. +It is also a brittle process that breaks the moment your model changes. +Because of this, developers end up spending **10%** of their time building the task pipeline itself, but the other **90%** in optimizing and iterating the prompt. +The process of closing the accuracy gap between the demo to the production is full of frustrations. 
+There is no doubt that the future of LLM applications is in auto-prompt optimization, not manual prompt engineering. +However, researchers are still trying to understand prompt engineering itself, and the process of automating it is even more in its infancy. + +**Know where the heavy lifting is** + +The heavy lifting of an LLM library is not in providing developers with out-of-the-box prompts, nor in integrations of different API providers or databases; it is in: + +- Core base classes and abstractions to help developers with "boring" things like serialization, deserialization, standardizing interfaces, and data processing. +- Building blocks to help LLMs interact with the world. +- `Evaluating` and `optimizing` the task pipeline. + +All while giving full control of the prompt and the task pipeline to the developers. + + + + + +.. raw:: + + [Optional] Side story: How `LightRAG` is born +.. ---------------------------------------------- + +.. The whole `PyTorch` library is built on a few core and base classes: ``Module``, ``Tensor``, ``Parameter``, and ``Optimizer``, +.. and various ``nn`` modules for users to build a model, along with ``functionals``. +.. This maps to ``Component``, ``DataClass``, ``Parameter``, and ``Optimizer`` in LightRAG, and various subcomponents +.. like ``Generator``, ``Retriever``, ``Prompt``, ``Embedder``, ``ModelClient``, along with ``functionals`` to process string, +.. interprect tool from the string. + +.. We recognize developers who are building real-world Large Language Model (LLM) applications are the real heroes, doing the hard +.. work. They need well-designed core building blocks: **easy** to understand, **transparent** to debug, **flexible** enough to customize their own +.. ``ModelClient``, their own ``Prompt``, their own ``Generator`` and even their own ``Optimizer``, ``Trainer``. The need to build their own component is even more so than using `PyTorch.` +.. LightRAG aggressively focus on the quality and clarity of the core building blocks over the quantity of integrations. + +.. the current state of the art in auto-prompt optimization is still in its infancy. +.. Though Auto-prompt optimization is the future, now we are still in the process of understanding more on prompt engineering itself and but it is a good starting point for auto-prompt optimization. + +.. The future is at the optimizing. +.. Using LLMs via apis or local LLMs is easy, so where is the value of having a library like `LightRAG`? + +.. In `PyTorch`, most likely users do not need to build their own ``conv`` or ``linear`` module, or their own ``Adam`` optimizer. +.. The existing building blocks can meet > 90% users' needs, leaving less than 10% of users, mostly contributors and researchers to build their own `Module`, `Tensor`, +.. `Optimizer`, etc. Excellent libraries like `PyTorch`, `numpy`, `scipy`, `sklearn`, `pandas` are all doing the heavy lifting on the computation optimization. -[Optional] Side story: How `LightRAG` is born ----------------------------------------------- \ No newline at end of file +.. Using LLMs via apis or local LLMs is easy, so where is the heavy lifting in the LLM applications? 
diff --git a/docs/source/developer_notes/logging_tracing.rst b/docs/source/developer_notes/logging_tracing.rst index 5ea0af87..3a3ded40 100644 --- a/docs/source/developer_notes/logging_tracing.rst +++ b/docs/source/developer_notes/logging_tracing.rst @@ -1,2 +1,170 @@ Logging & Tracing -================== \ No newline at end of file +================== + +Tracing +^^^^^^^^^^^ +In particular, we provide two tracing methods to help you develop and improve the Generator: + +1. Trace the history change(states) on prompt during your development process. Developers typically go through a long process of prompt optimization and it is frustrating +to lose track of the prompt changes when your current change actually makes the performance much worse. + +We created a `GeneratorStateLogger` to handle the logging and saving into json files. To further simplify developers's process, +we provides a class decorator `trace_generator_states` where a single line of code can be added to any of your task component. +It will automatically track any attributes of type `Generator`. + +.. code-block:: python + + from tracing import trace_generator_states + from core import Component, Generator + + @trace_generator_states() + class SimpleQA(Component): + def __init__(self): + super().__init__() + self.generator = Generator(...) + self.generator_2 = Generator(...) + def call(...): + +In default, a dir from the current working directory will be created to store the log files. +The project name in defaul is `SimpleQA` and the log file will be named as `generator_state_trace.json` +where both the `generator` and `generator_2` will be logged. +The structure of log directory is as follows: + +.. code-block:: bash + + . + ├── traces + │ ├── SimpleQA + │ │ ├── generator_state_trace.json + + + +Here is an example log file: + +.. code-block:: json + + { + "generator": [ + { + "prompt_states": { + "_components": {}, + "_parameters": {}, + "training": false, + "_template_string": "{# task desc #}\n{% if task_desc_str %}\n{{task_desc_str}}\n{% else %}\nAnswer user query.\n{% endif %}\n{# output format #}\n{% if output_format_str %}\n\n{{output_format_str}}\n\n{% endif %}\n{# tools #}\n{% if tools_str %}\n\n{{tools_str}}\n\n{% endif %}\n{# example #}\n{% if examples_str %}\n\n{{examples_str}}\n\n{% endif %}\n{# chat history #}\n{% if chat_history_str %}\n\n{{chat_history_str}}\n\n{% endif %}\n{#contex#}\n{% if context_str %}\n\n{{context_str}}\n\n{% endif %}\n{# steps #}\n{% if steps_str %}\n\n{{steps_str}}\n\n{% endif %}\n{% if input_str %}\n\n{{input_str}}\n\n{% endif %}\n{% if output_str %}\n\n{{output_str}}\n\n{% endif %}\n", + "prompt_variables": [ + "chat_history_str", + "context_str", + "examples_str", + "input_str", + "output_format_str", + "output_str", + "steps_str", + "task_desc_str", + "tools_str" + ], + "preset_prompt_kwargs": { + "task_desc_str": "You are a helpful assistant and with a great sense of humor." 
+ } + }, + "time_stamp": "2024-06-02T15:55:21.765794" + }, + { + "prompt_states": { + "_components": {}, + "_parameters": {}, + "training": false, + "_template_string": "{# task desc #}\n{% if task_desc_str %}\n{{task_desc_str}}\n{% else %}\nAnswer user query.\n{% endif %}\n{# output format #}\n{% if output_format_str %}\n\n{{output_format_str}}\n\n{% endif %}\n{# tools #}\n{% if tools_str %}\n\n{{tools_str}}\n\n{% endif %}\n{# example #}\n{% if examples_str %}\n\n{{examples_str}}\n\n{% endif %}\n{# chat history #}\n{% if chat_history_str %}\n\n{{chat_history_str}}\n\n{% endif %}\n{#contex#}\n{% if context_str %}\n\n{{context_str}}\n\n{% endif %}\n{# steps #}\n{% if steps_str %}\n\n{{steps_str}}\n\n{% endif %}\n{% if input_str %}\n\n{{input_str}}\n\n{% endif %}\n{% if output_str %}\n\n{{output_str}}\n\n{% endif %}\n", + "prompt_variables": [ + "chat_history_str", + "context_str", + "examples_str", + "input_str", + "output_format_str", + "output_str", + "steps_str", + "task_desc_str", + "tools_str" + ], + "preset_prompt_kwargs": { + "task_desc_str": "You are a helpful assistant and with a great sense of humor. Second edition." + } + }, + "time_stamp": "2024-06-02T15:56:37.756148" + } + ], + "generator2": [ + { + "prompt_states": { + "_components": {}, + "_parameters": {}, + "training": false, + "_template_string": "{# task desc #}\n{% if task_desc_str %}\n{{task_desc_str}}\n{% else %}\nAnswer user query.\n{% endif %}\n{# output format #}\n{% if output_format_str %}\n\n{{output_format_str}}\n\n{% endif %}\n{# tools #}\n{% if tools_str %}\n\n{{tools_str}}\n\n{% endif %}\n{# example #}\n{% if examples_str %}\n\n{{examples_str}}\n\n{% endif %}\n{# chat history #}\n{% if chat_history_str %}\n\n{{chat_history_str}}\n\n{% endif %}\n{#contex#}\n{% if context_str %}\n\n{{context_str}}\n\n{% endif %}\n{# steps #}\n{% if steps_str %}\n\n{{steps_str}}\n\n{% endif %}\n{% if input_str %}\n\n{{input_str}}\n\n{% endif %}\n{% if output_str %}\n\n{{output_str}}\n\n{% endif %}\n", + "prompt_variables": [ + "chat_history_str", + "context_str", + "examples_str", + "input_str", + "output_format_str", + "output_str", + "steps_str", + "task_desc_str", + "tools_str" + ], + "preset_prompt_kwargs": { + "task_desc_str": "You are the second generator." + } + }, + "time_stamp": "2024-06-03T16:44:45.223220" + } + ] + } + +2. Trace all failed LLM predictions for further improvement. + +Similarly, :class:`tracing.generator_call_logger.GeneratorCallLogger` is created to log generator call input arguments and output results. +`trace_generator_call` decorator is provided to provide one-line setup to trace calls, which in default will log only failed predictions. + +Adding the second decorator to the above example: + +.. code-block:: python + + from tracing import trace_generator_errors + + @trace_generator_call() + @trace_generator_states() + class SimpleQA(Component): + def __init__(self): + super().__init__() + self.generator = Generator(...) + self.generator_2 = Generator(...) + def call(...): + +Now, three more files will be created in the log directory: + +.. code-block:: bash + + . + ├── traces + │ ├── SimpleQA + │ │ ├── logger_metadata.json + │ │ ├── generator_call.jsonl + │ │ ├── generator_2_call.jsonl + +The `logger_metadata.json` file contains the metadata of the logger, it looks like this: + +.. 
code-block:: json + + { + "generator": "./traces/SimpleQA/generator_call.jsonl", + "generator2": "./traces/SimpleQA/generator2_call.jsonl" + } + +The `generator_call.jsonl` file contains the log of all calls to the generator, it looks like this: + +.. code-block:: json + + {"prompt_kwargs": {"input_str": "What is the capital of France?"}, "model_kwargs": {}, "output": {"data": "Bonjour!\n\nThe capital of France is Paris, of course! But did you know that the Eiffel Tower in Paris is actually the most-visited paid monument in the world? Mind-blowing, right?\n\nNow, would you like to know some more fun facts or perhaps ask another question? I'm all ears (or should I say, all eyes?)", "error_message": null, "raw_response": "Bonjour!\n\nThe capital of France is Paris, of course! But did you know that the Eiffel Tower in Paris is actually the most-visited paid monument in the world? Mind-blowing, right?\n\nNow, would you like to know some more fun facts or perhaps ask another question? I'm all ears (or should I say, all eyes?)"}, "time_stamp": "2024-06-03T16:44:45.582859"} + +.. note :: + + Usually, let the evaluation run on evaluation to collect as much as failed predictions can be highly helpful for either manual prompting or auto-prompt engineering (APE). diff --git a/docs/source/developer_notes/model_client.rst b/docs/source/developer_notes/model_client.rst index 98b21bea..abad8f02 100644 --- a/docs/source/developer_notes/model_client.rst +++ b/docs/source/developer_notes/model_client.rst @@ -1,5 +1,11 @@ ModelClient ============ + +.. admonition:: Author + :class: highlight + + `Li Yin `_ + What you will learn? 1. What is ``ModelClient`` and why is it designed this way? @@ -23,7 +29,7 @@ Because so, by switching off ``ModelClient`` in a ``Generator`` or ``Embedder`` Model Inference SDKs ------------------------ -With cloud API providers like OpenAI, Groq, Anthropic, it often comes with a `sync` and an `async` client via their SDKs. +With cloud API providers like OpenAI, Groq, Anthropic, it often comes with a `sync` and an `async` client via their SDKs. For example: .. code-block:: python @@ -33,7 +39,7 @@ For example: sync_client = OpenAI() async_client = AsyncOpenAI() - # sync call using APIs + # sync call using APIs response = sync_client.chat.completions.create(...) For local models, such as using `huggingface transformers`, you need to create this model inference SDKs yourself. @@ -141,7 +147,7 @@ This is how `OpenAIClient` implements these methods along with ``__init__`` meth class OpenAIClient(ModelClient): def __init__(self, api_key: Optional[str] = None): - + super().__init__() self._api_key = api_key self.sync_client = self.init_sync_client() @@ -175,7 +181,7 @@ This is how ``TransformerClient`` does the same thing: } def init_sync_client(self): - return TransformerEmbedder() + return TransformerEmbedder() Second. we use `convert_inputs_to_api_kwargs` for subclass to convert LightRAG inputs into the `api_kwargs` (SDKs arguments). 
@@ -202,7 +208,7 @@ This is how `OpenAIClient` implements this method: model_kwargs: Dict = {}, model_type: ModelType = ModelType.UNDEFINED, ) -> Dict: - + final_model_kwargs = model_kwargs.copy() if model_type == ModelType.EMBEDDER: if isinstance(input, str): @@ -314,8 +320,8 @@ Here is an example to use ``OpenAIClient`` directly, first on LLM model: prompt = f"User: {query}\n" model_kwargs = {"model": "gpt-3.5-turbo", "temperature": 0.5, "max_tokens": 100} - api_kwargs = openai_client.convert_inputs_to_api_kwargs(input=prompt, - model_kwargs=model_kwargs, + api_kwargs = openai_client.convert_inputs_to_api_kwargs(input=prompt, + model_kwargs=model_kwargs, model_type=model_type) print(f"api_kwargs: {api_kwargs}") @@ -325,10 +331,10 @@ Here is an example to use ``OpenAIClient`` directly, first on LLM model: The output will be: -.. code-block:: +.. code-block:: api_kwargs: {'model': 'gpt-3.5-turbo', 'temperature': 0.5, 'max_tokens': 100, 'messages': [{'role': 'system', 'content': 'User: What is the capital of France?\n'}]} - response_text: The capital of France is Paris. + response_text: The capital of France is Paris. Then on Embedder model: diff --git a/docs/source/developer_notes/optimizer.rst b/docs/source/developer_notes/optimizer.rst index e6201a16..d26f6ad0 100644 --- a/docs/source/developer_notes/optimizer.rst +++ b/docs/source/developer_notes/optimizer.rst @@ -1,2 +1,2 @@ -Optimizer - The Future of LLM applications -========================================================== \ No newline at end of file +Optimizer +========================================================== diff --git a/docs/source/developer_notes/output_parsers.rst b/docs/source/developer_notes/output_parsers.rst new file mode 100644 index 00000000..dc1f8e4c --- /dev/null +++ b/docs/source/developer_notes/output_parsers.rst @@ -0,0 +1,2 @@ +OutputParser +============= diff --git a/docs/source/developer_notes/prompt.rst b/docs/source/developer_notes/prompt.rst index d5624f4e..dfc9532f 100644 --- a/docs/source/developer_notes/prompt.rst +++ b/docs/source/developer_notes/prompt.rst @@ -1,169 +1,251 @@ Prompt ============ -We strick to maximize developers' control towards the final experience and performance, simplify the development process, and minimize the token consumption. +.. admonition:: Author + :class: highlight -For the major chat models, we eventually will only send two messages to the model: the system message and the user message. The user message is simple, -often you have a message `{'role': 'user', 'content': 'Hello, how are you?'}`. The system message is more complex, it contains the task description, tools, examples, chat history, context, and -intermediate step history from agents. + `Li Yin `_ -Prompt template ---------------------- +Context +---------------- -Our `DEFAULT_LIGHTRAG_SYSTEM_PROMPT` templates the system prompt with 7 important sections. We leverage `jinjia2` template for **programmable prompt** right along with string. +The prompt refers to the text input to the LLM models. +When sent to an LLM, the model uses the prompt to auto-regressively generate the next tokens, continuing the process until it reaches a specified stopping criterion. +The prompt itself plays a crucial role in the performance of the desired tasks. +Researchers often use `special tokens` [1]_ to separate different sections of the prompt, such as the system message, user message, and assistant message. +Ideally, developers should format this prompt with special tokens specific to the model's at training time. 
+However, many proprietary APIs did not disclose their special tokens, and requires users to send them in the forms of messages of different roles. -The default template comes with 7 variables: `task_desc_str`, `output_format_str`, `tools_str`, `examples_str`, `chat_history_str`, `context_str`, and `steps_str`. +Design +---------------- + +`LightRAG` seeks to maximize developers' control over the prompt. +Thus, in most cases, we help developers gather different sections and form them into one prompt. +This prompt will then be send to the LLM as a single message. +The default role of the message we use is `system`. +Though it is not a special token, we use ```` to represent the system message in the prompt, which works quite well. -A jinjia2 template will rendered with :ref:`Prompt` class. If some fields being empty, that section will be empty in the final prompt string. .. code-block:: python - :linenos: - DEFAULT_LIGHTRAG_SYSTEM_PROMPT = r"""{# task desc #} - {% if task_desc_str %} - {{task_desc_str}} - {% endif %} - {# tools #} - {% if tools_str %} - - {{tools_str}} - - {% endif %} - {# example #} - {% if examples_str %} - - {{examples_str}} - - {% endif %} - {# chat history #} - {% if chat_history_str %} - - {{chat_history_str}} - - {% endif %} - {#contex#} - {% if context_str %} - - {{context_str}} - - {% endif %} - {# steps #} - {% if steps_str %} - - {{steps_str}} - - {% endif %} - """ - -Across our library, here our advanced features: - -- Various output formats where the `output_format_str` variable is used to pass the output format to the model. - -- Few-shot and Many-shots In-context Learning (ICL) where the `examples_str` variable is used to pass the examples to the model. - -- Tools/Function Calls where the `tools_str` variable is used to pass the tools to the model. - -- Memory where the `chat_history_str` variable is used to pass the memory to the model. - -- Retrieval augmented generation(RAG) where the `context_str`` variable is used to pass the retrieved context. - -- Agent with multiple step planning and replanning capabilities, where the `steps_str` variable is used to pass the previous steps to the model. - -**Note: this means in default our out-of-box components would not support API providers's tools/function calls as we only send the system and user messages to the model. -But it should not stop you from implementing them yourself.** + simple_prompt = r""" You are a helpful assistant. User: What can you help me with?""" -Prompt class ---------------------- -We designed a :ref:`Prompt` class to render the `template` with the variables to string as the final system prompt. In the simplest case, the string is empty and we will only send -a user message to the model. And in most cases, you want to add at least the `task_desc_str` to the system message. +If it is `Llama3` model, the final text sent to the model for tokenization will be: + +.. code-block:: python -The cool thing about our `Prompt` system is how flexible it can be. If you need to put another `template` for say `task_desc_str`, you can do that using the `Prompt` class. -For example, your task is to instruct the llm to choose `top_k` from the given choices, you can define a new template like this: + final_prompt = r"""<|begin_of_text|><|start_header_id|>system<|end_header_id|> + {{simple_prompt}} <|eot_id|>""" + +And the LLM will return the following text: .. 
code-block:: python - :linenos: - from core.prompt_builder import Prompt + prediction = r"""<|start_header_id|>assistant<|end_header_id|> You can ask me anything you want. <|eot_id|><|end_of_text|>""" + +Data Flow in LLM applications +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. figure:: /_static/images/LightRAG_dataflow.png + :align: center + :alt: Data Flow in LLM applications + :width: 620px + + Data flow in LLM applications - task_desc_template = r""" - Choose the top {{top_k}} from the following choices: {{choices}} - """ - top_k = 3 - choices = ['apple', 'banana', 'orange', 'grape'] - task_desc_prompt = Prompt(template=task_desc_template, preset_prompt_kwargs={'top_k': top_k, 'choices': choices}) - task_desc_str = task_desc_prompt.call() - prompt = Prompt(preset_prompt_kwargs={'task_desc_str': task_desc_str}) - prompt.print_prompt() +Look at the most complicated case: We will have user query, retrieved context, task description, definition of tools, few-shot examples, past conversation history, step history from the agent, and the output format specification. +All these different parts need to be formatted into a single prompt. +We have to do all this with flexibility and also make it easy for developers to read. -The output would be: -.. code-block:: xml + +Why Jinja2? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To format the prompt, you can use any of Python's native string formatting. + +.. code-block:: python :linenos: - Choose the top 3 from the following choices: ['apple', 'banana', 'orange', 'grape'] + # percent(%) formatting + print("%s User: %s" % (task_desc_str, input_str)) + # format() method with kwargs + print( + "{task_desc_str} User: {input_str}".format( + task_desc_str=task_desc_str, input_str=input_str + ) + ) + # f-string + print(f"{task_desc_str} User: {input_str}") + # Templates + from string import Template -Prompt and Special Tokens context ----------------------------------- + t = Template("$task_desc_str User: $input_str") + print(t.substitute(task_desc_str=task_desc_str, input_str=input_str)) -Each section other than `task_desc_str` is encapulated in a special token. Different model can have different special tokens. -Here is one example of `Llama3 Documentation `_ prompts formatted with special tokens: +We opted for `Jinja2` [1]_ as the templating engine for the prompt. +Besides the placeholders using ``{{}}`` for keyword arguments, Jinja2 also allow users to write code similar to Python syntax. +This includes conditionals, loops, filters, and even comments, which are lacking in Python's native string formatting. +Here is one example of using `Jinja2` to format the prompt: -input string to the LLM model and minimize the token consumption. -We enable advanced features without relying on API provider's prompt manipulation such as `OpenAI`'s tools or assistant APIs. -.. code-block:: - :linenos: +.. code-block:: python + + def jinja2_template_example(**kwargs): + from jinja2 import Template - <|begin_of_text|><|start_header_id|>system<|end_header_id|> + template = r"""{{ task_desc_str }} + {# tools #} + {% if tools %} + + {% for tool in tools %} + {{loop.index}}. {{ tool }} + {% endfor %} + + {% endif %} + User: {{ input_str }}""" + t = Template(template, trim_blocks=True, lstrip_blocks=True) + print(t.render(**kwargs)) - You are a helpful AI assistant for travel tips and recommendations<|eot_id|> - - <|start_header_id|>user<|end_header_id|> - What can you help me with?<|eot_id|> +Let's call it with and without tools: + +.. 
code-block:: python - <|start_header_id|>assistant<|end_header_id|> + jinja2_template_example(task_desc_str=task_desc_str, input_str=input_str) + jinja2_template_example( + task_desc_str=task_desc_str, input_str=input_str, tools=tools + ) +The printout would be: +.. code-block:: + You are a helpful assitant + User: What is the capital of France? +And with tools: -Here is how you customize a new prompt: +.. code-block:: + + You are a helpful assitant + + 1. google + 2. wikipedia + 3. wikidata + + User: What is the capital of France? + +We can see how easy and flexible to programmatically format the prompt with `Jinja2`. + + + +Prompt class +---------------- + + +We created our :class:`Prompt Component` to render the prompt with the string ``template`` and ``prompt_kwargs``. +It is a simple component, but it is quite handy. +Let's use the same template as above: .. code-block:: python - :linenos: - from core.prompt_builder import Prompt + from lightrag.core.prompt_builder import Prompt + + prompt = Prompt( + template=template, + prompt_kwargs={ + "task_desc_str": task_desc_str, + "tools": tools, + }, + ) + print(prompt) + print(prompt(input_str=input_str)) # takes the rest arguments in keyword arguments + +The ``Prompt`` class allow us to preset some of the prompt arguments at initialization, and then we can call the prompt with the rest of the arguments. +Also, by subclassing ``Component``, we can easily visualize this component with ``print``. +Here is the output: + +.. code-block:: + + Prompt( + template: {{ task_desc_str }} + {# tools #} + {% if tools %} + + {% for tool in tools %} + {{loop.index}}. {{ tool }} + {% endfor %} + + {% endif %} + User: {{ input_str }}, prompt_kwargs: {'task_desc_str': 'You are a helpful assitant', 'tools': ['google', 'wikipedia', 'wikidata']}, prompt_variables: ['input_str', 'tools', 'task_desc_str'] + ) + +As with all components, you can use ``to_dict`` and ``from_dict`` to serialize and deserialize the component. + +Default Prompt Template +------------------------- + +In default, the ``Prompt`` class uses the :const:`DEFAULT_LIGHTRAG_SYSTEM_PROMPT` as its string template if no template is provided. +This default template allows you to conditionally passing seven important variables designed from the data flow diagram above. +These varaibles are: + +.. code-block:: python + + LIGHTRAG_DEFAULT_PROMPT_ARGS = [ + "task_desc_str", # task description + "output_format_str", # output format of the task + "tools_str", # tools used in the task + "examples_str", # examples of the task + "chat_history_str", # chat history of the user + "context_str", # context of the user query + "steps_str", # used in agent steps + "input_str", # user query or input + ] + +Now, let's see the minimum case where we only have the user query: + +.. code-block:: python + + prompt = Prompt() + output = prompt(input_str=input_str) + print(output) + +The output will be the bare minimum with only the user query and a prefix for assistant to respond: + +.. code-block:: - new_template = r""" - <|begin_of_text|><|start_header_id|>system<|end_header_id|> - {{task_desc_str}} - Your context: {{context_str}} <|eot_id|> + + What is the capital of France? + + You: - <|start_header_id|>user<|end_header_id|> - {{query_str}}<|eot_id|> +.. note:: - <|start_header_id|>assistant<|end_header_id|> - """ + In reality, we barely need to use the raw ``Prompt`` class directly as it is orchestrated by the ``Generator`` component together with the ``ModelClient`` that we will introduce next. 
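Even so, it is worth seeing how the conditional sections behave when more of these variables are supplied: any section whose variable is left out is simply skipped by the Jinja2 conditionals. Here is a small sketch using the default template (the exact tag names in the rendered output depend on the default template shipped with the library):

.. code-block:: python

    from lightrag.core.prompt_builder import Prompt

    # Preset the task description and a few-shot example at initialization.
    prompt = Prompt(
        prompt_kwargs={
            "task_desc_str": "You are a helpful assistant.",
            "examples_str": "User: What is 1+1?\nYou: 2",
        }
    )
    # Only the task description, examples, and user query sections appear in
    # the rendered prompt; the other five sections are skipped.
    print(prompt(input_str="What is the capital of France?"))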
- prompt = Prompt(template=new_template) -Prompt Engineering experience -------------------------------- -There is not robust prompt, and it is one of the most sensitive creatures in the AI world. -Here are some tips: -- Even the output format matters, the order of your output fields, the formating. -Output yaml or json format can lead to different performance. We have better luck with yaml format. -- Few-shot works so well in some case, but it can lead to regression in some cases. -- It is not fun to be a prompt engineer! But what can we do for now. +.. Prompt Engineering experience +.. ------------------------------- +.. There is no robust prompt, and it is one of the most sensitive creatures in the AI world. +.. Here are some tips: +.. - Even the output format matters, the order of your output fields, the formating. Output yaml or json format can lead to different performance. We have better luck with yaml format. +.. - Few-shot works so well in some case, but it can lead to regression in some cases. +.. - It is not fun to be a prompt engineer! But what can we do for now. -Resources: -1. `Jinja2`: +.. admonition:: References + :class: highlight + .. [1] Jinja2: https://jinja.palletsprojects.com/en/3.1.x/ + .. [2] Llama3 special tokens: https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3/ +.. admonition:: API References + :class: highlight + - :class:`core.prompt_builder.Prompt` + - :const:`core.default_prompt_template.DEFAULT_LIGHTRAG_SYSTEM_PROMPT` diff --git a/docs/source/developer_notes/react_agent_xy.rst b/docs/source/developer_notes/react_agent_xy.rst index f3c0599a..c5440dca 100644 --- a/docs/source/developer_notes/react_agent_xy.rst +++ b/docs/source/developer_notes/react_agent_xy.rst @@ -19,7 +19,7 @@ Introduction ----------------------- Before explaining ``LightRAG Agent`` implementation, here is a quick introduction of ReAct Agent. -To solve a query, the `ReAct Agent `_, like its name(``Re``- Reason; ``Act`` - Act), +To solve a query, the `ReAct Agent `_, like its name(``Re``- Reason; ``Act`` - Act), first uses LLM to analyze the context and plan actions to answer the query(reasoning). Then it takes actions to utilize external resources(action). For more details, please see the :ref:`deep-dive`. @@ -27,13 +27,13 @@ LightRAG's Implementation ----------------------------------------------------- Next, let's look at how ``LightRAG`` makes the implementation convenient. In ``LightRAG``, the ReAct agent is a type of :ref:`generator` that runs multiple sequential steps to generate the final response, with designed prompt, external functions(named as ``tools``) and ``JsonParser output_processors``. -1. **Prompt:** We have a easy-to-customizable prompt template designed for ReAct agent that takes in -``tools``, few shot ``examples``, ``history``, and ``user query``. -The ``history`` will be automatically managed by the agent. ``user query`` will be handled in each single turn. -Hence when initializing an agent, we only need to set up the ``tools`` and the ``examples`` in the ``preset_prompt_kwargs`` for the system prompt and +1. **Prompt:** We have a easy-to-customizable prompt template designed for ReAct agent that takes in +``tools``, few shot ``examples``, ``history``, and ``user query``. +The ``history`` will be automatically managed by the agent. ``user query`` will be handled in each single turn. 
+Hence when initializing an agent, we only need to set up the ``tools`` and the ``examples`` in the ``preset_prompt_kwargs`` for the system prompt and use ``user query`` in each agent call. :ref:`Prompt `. -2. **Tools:** ReAct Agent needs to plan the tool to use, which means it needs to access the tools' descriptions. +2. **Tools:** ReAct Agent needs to plan the tool to use, which means it needs to access the tools' descriptions. ``LightRAG`` provides dynamic tool handling, using ``FunctionTool`` to encapsulate tool functionalities. The metadata(function name, description, and parameters) will be extracted and passed to the prompt automatically. This process not only makes tool integration more seamless but also enhances developer efficiency by allowing straightforward definition and management of tools. Here is the example to illustrate the usage of ``FunctionTool``. It's easy to set up using ``from_defaults``. @@ -73,19 +73,19 @@ Here is the example to illustrate the usage of ``FunctionTool``. It's easy to se # Function parameter: {"type": "object", "properties": {"a": {"type": "int"}, "b": {"type": "int"}}, "required": ["a", "b"]} The agent will then call these external functions based on the function descriptions. -In addition to user-defined tools, the :class:`ReActAgent ` built-in ``llm_tool`` +In addition to user-defined tools, the :class:`ReActAgent ` built-in ``llm_tool`` for leveraging LLM's internal knowledge, and ``finish`` for completing processes. ``llm_tool`` uses the same model with the agent. Developers have the flexibility to enable or disable these as needed. 3. **Output Parser:** ``LightRAG`` requests the model to output intermediate Thought and Action as JSON, which facilitates better error handling and easier data manipulation than strings. For example, - + .. code-block:: json - + { "thought": "", "action": "ToolName(, )" } -This format allows the ``LightRAG`` JSON parser to efficiently decode the model's output and extract arguments. +This format allows the ``LightRAG`` JSON parser to efficiently decode the model's output and extract arguments. The parsed data is then utilized by the ``StepOutput`` class to manage the flow of thought, action and observation. 4. **Example:** Let's see a Q&A agent example: @@ -93,7 +93,7 @@ The parsed data is then utilized by the ``StepOutput`` class to manage the flow .. code-block:: python from lightrag.core.tool_helper import FunctionTool - from lightrag.components.agent.react_agent import ReActAgent + from lightrag.components.agent.react import ReActAgent from lightrag.components.model_client import OpenAIClient from lightrag.components.model_client import GroqAPIClient @@ -121,8 +121,8 @@ The parsed data is then utilized by the ``StepOutput`` class to manage the flow # print(f"Function name: {name}") # print(f"Function description: {description}") # print(f"Function parameter: {parameter}") - - + + examples = [ """ User: What is 9 - 3? @@ -147,13 +147,13 @@ The parsed data is then utilized by the ``StepOutput`` class to manage the flow preset_prompt_kwargs=preset_prompt_kwargs ) - import time + import time queries = ["What is 3 add 4?", "3*9=?"] average_time = 0 for query in queries: t0 = time.time() answer = agent(query) - + # Answer: The answer is 7. # Answer: The answer is 27. @@ -166,7 +166,7 @@ ReAct Agent Deep Dive --------------------------- Please read this section if you need more information on ReAct agent. 
-`ReAct Agent `_, like its name(``Re``- Reason; ``Act`` - Act), is a framework generating reasoning and taking actions in an interleaved manner. The reasoning step guides the model to action plans and the action step allows the agent to interact with external sources such as knowledge bases. +`ReAct Agent `_, like its name(``Re``- Reason; ``Act`` - Act), is a framework generating reasoning and taking actions in an interleaved manner. The reasoning step guides the model to action plans and the action step allows the agent to interact with external sources such as knowledge bases. The paper shows: 1. ReAct with few-shot prompt and Wikipedia API interaction outperforms chain-of-thought on `HotpotQA `_ (Question and Answering) and `Fever `_ (Fact Verification). @@ -184,7 +184,7 @@ The environment contains user query, step histories, observations, and external At each step, the agent: -- **[Thought]** In response to the environment and user query, the agent uses its LLM to generate a strategic thought that outlines a plan or hypothesis guiding the subsequent action. +- **[Thought]** In response to the environment and user query, the agent uses its LLM to generate a strategic thought that outlines a plan or hypothesis guiding the subsequent action. - **[Action]** The agent executes the action. @@ -192,9 +192,9 @@ The environment will be updated: - **[Observation]** The observation is created after the action is done. -Then the agent iteratively generates thoughts based on latest observation and context(previous steps), takes actions and gets new observations. +Then the agent iteratively generates thoughts based on latest observation and context(previous steps), takes actions and gets new observations. -The termination condition is: +The termination condition is: * The agent finds the answer and takes "finish" action. @@ -202,10 +202,10 @@ The termination condition is: **2. Action Space** -Now we understand the 3 different stages: Thought, Action, Observation. Let's focus on Action, one of agents' uniqueness. +Now we understand the 3 different stages: Thought, Action, Observation. Let's focus on Action, one of agents' uniqueness. Actions refer to the tools the agent uses to interact with the environment and creates observations. -Note: the paper defines Thought(or reasoning trace) as a *language level action* but it is not included in the action space because it doesn't impact the environment. +Note: the paper defines Thought(or reasoning trace) as a *language level action* but it is not included in the action space because it doesn't impact the environment. Use ``HotpotQA`` dataset as an example, what external source do we need to answer questions? @@ -213,11 +213,11 @@ Use ``HotpotQA`` dataset as an example, what external source do we need to answe In the `ReAct paper `_, researchers include 3 actions in the "action space" (simplified version here): -* search[entity], returns the first 5 sentences from the corresponding entity wiki page if it exists, or else suggests top-5 similar entities. +* search[entity], returns the first 5 sentences from the corresponding entity wiki page if it exists, or else suggests top-5 similar entities. -* lookup[string], simulating Ctrl+F functionality on the browser. +* lookup[string], simulating Ctrl+F functionality on the browser. -* finish[answer], which would finish the current task with answer. +* finish[answer], which would finish the current task with answer. **3. 
Components** @@ -227,5 +227,5 @@ With the workflow and action space, next, let's focus on the components needed t * **function call:** In the implementation, each action is essentially a function to call. Clear functionality definition is important for the agent to determine which action to take next. -* **parser:** The agent is built on LLMs. It takes in the prompt with context, generates thought and determine the action to take in text response. -To really call functions, we need to parse the text response to get the parameters for the determined function. \ No newline at end of file +* **parser:** The agent is built on LLMs. It takes in the prompt with context, generates thought and determine the action to take in text response. +To really call functions, we need to parse the text response to get the parameters for the determined function. diff --git a/docs/source/developer_notes/retriever.rst b/docs/source/developer_notes/retriever.rst index e1cd2354..d99201f5 100644 --- a/docs/source/developer_notes/retriever.rst +++ b/docs/source/developer_notes/retriever.rst @@ -1,6 +1,11 @@ Retriever ============ +.. admonition:: Author + :class: highlight + + `Li Yin `_ + Context ------------------ @@ -183,7 +188,7 @@ Optionally, the subclass can implement ``save_to_file`` and ``load_from_file`` t As the retriever is a subclass of component, you already inherited powerful serialization and deserialization methods such as ``to_dict``, ``from_dict``, and ``from_config`` to help with the saving and loading process. As for helper attributes, we have ``indexed`` and ``index_keys`` to differentiate if the retriever is ready for retrieval and the attributes that are key to restore the functionality/states of the retriever. It is up the subclass to decide how to decide the storage of the index, it can be in-memory, local disk, or cloud storage, or save as json or pickle file or even a db table. -As an example, :class:`components.retriever.bm25_retriever.InMemoryBM25Retriever` has the following key attributes to index. +As an example, :class:`components.retriever.bm25_retriever.BM25Retriever` has the following key attributes to index. .. code:: python @@ -194,7 +199,7 @@ Retriever in Action -------------------- All of our retrievers are subclassed from the base retriever, and they are located in the ``components.retriever`` module. You can skim through their implementations here: :ref:`retriever`. -Currently only :class:`components.retriever.faiss_retriever.InMemoryBM25Retriever` needs to have its own ``save_to_file`` and ``load_from_file`` to avoid recomputation again. +Currently only :class:`components.retriever.faiss_retriever.BM25Retriever` needs to have its own ``save_to_file`` and ``load_from_file`` to avoid recomputation again. The ``FAISSRetriever`` will work with a database instead to store the embeddings and it alleviates the need for the retriever to deal with states saving. In this note, we will use the following documents and queries for demonstration: @@ -307,7 +312,7 @@ You can check the retriever for more type of scores. BM25Retriever ^^^^^^^^^^^^^^^^^^^^^^^^ -So the semantic search works pretty well. We will see how :class:`components.retriever.bm25_retriever.InMemoryBM25Retriever` works in comparison. +So the semantic search works pretty well. We will see how :class:`components.retriever.bm25_retriever.BM25Retriever` works in comparison. We reimplemented the code in [9]_ with one improvement: instead of using ``text.split(" ")``, we use tokenizer to split the text. 
Here is a comparison of how they different: .. code-block:: python @@ -328,11 +333,11 @@ We prepare the retriever: .. code-block:: python - from lightrag.components.retriever import InMemoryBM25Retriever + from lightrag.components.retriever import BM25Retriever document_map_func = lambda x: x["content"] - bm25_retriever = InMemoryBM25Retriever(top_k=2, documents=documents, document_map_func=document_map_func) + bm25_retriever = BM25Retriever(top_k=2, documents=documents, document_map_func=document_map_func) print(bm25_retriever) It takes ``document_map_func`` to map the documents to the text format the retriever can work with. @@ -340,7 +345,7 @@ The output is: .. code-block:: - InMemoryBM25Retriever(top_k=2, k1=1.5, b=0.75, epsilon=0.25, use_tokenizer=True, total_documents=4) + BM25Retriever(top_k=2, k1=1.5, b=0.75, epsilon=0.25, use_tokenizer=True, total_documents=4) Now we call the retriever exactly the same way as we did with the FAISS retriever: @@ -613,6 +618,6 @@ Additionally, ``LocalDB`` help us keep track of our initial documents and its tr - :class:`core.retriever.Retriever` - :ref:`core.types` - :class:`components.retriever.faiss_retriever.FAISSRetriever` - - :class:`components.retriever.bm25_retriever.InMemoryBM25Retriever` + - :class:`components.retriever.bm25_retriever.BM25Retriever` - :class:`components.retriever.reranker_retriever.RerankerRetriever` - - :class:`components.retriever.llm_retriever.LLMRetriever` \ No newline at end of file + - :class:`components.retriever.llm_retriever.LLMRetriever` diff --git a/docs/source/developer_notes/retriever_xy.rst b/docs/source/developer_notes/retriever_xy.rst index 2230edc5..ff67c41b 100644 --- a/docs/source/developer_notes/retriever_xy.rst +++ b/docs/source/developer_notes/retriever_xy.rst @@ -5,7 +5,7 @@ In this tutorial, we will explain each component in ``LightRAG's Retriever`` and LLMs develop fast, but they have limitations. -**Content Window Limit:** Although the trend is, LLM models' content window keeps growing, there is still a context limit. +**Content Window Limit:** Although the trend is, LLM models' content window keeps growing, there is still a context limit. **Signal to Noise Ratio** Meanwhile, LLMs perform better when the provided contents are relevant to the task. @@ -18,8 +18,8 @@ The common solution for Retrieval is to chunk the documents into smaller context 1. Document Splitter ---------------------- -The DocumentSplitter in LightRAG is designed to preprocess text by splitting long documents into smaller chunks. -This improves the performance of embedding models and ensures they operate within their maximum context length limits. +The DocumentSplitter in LightRAG is designed to preprocess text by splitting long documents into smaller chunks. +This improves the performance of embedding models and ensures they operate within their maximum context length limits. ``LightRAG's DocumentSplitter`` splits a list of documents (:obj:`core.base_data_class.Document`) into a list of shorter documents. The document object to manage id, document content,optional meta data, document's embedding vectors, etc. @@ -50,16 +50,16 @@ Check the following table for ``split_by`` options: - ```` - ``Hello, world. This is LightRAG.`` to ``['Hello, ', 'world. ', 'This ', 'is ', 'LightRAG.']`` -We will use ``word`` in our example. +We will use ``word`` in our example. -* ``split_length`` is the the maximum number of units in each split. +* ``split_length`` is the the maximum number of units in each split. 
* ``split_overlap`` is the number of units that each split should overlap. Including context at the borders prevents sudden meaning shift in text between sentences/context, especially in sentiment analysis. In ``LightRAG`` we use ``windowed`` function in ``more-itertools`` package to build a sliding window for the texts to keep the overlaps. The window step size = ``split_length - split_overlap``. After splitting the long text into a list and using a sliding window to generate the text lists with specified overlap length, the text list will be concatenated into text pieces again. Here is a quick example: -``Review: The theater service is terrible. The movie is good.`` Set ``split_by: word``, ``split_length: 6``, ``split_overlap: 2``. +``Review: The theater service is terrible. The movie is good.`` Set ``split_by: word``, ``split_length: 6``, ``split_overlap: 2``. With our ``DocumentSplitter`` logic, the output will be: ``Review: The theater service is terrible.``, ``is terrible. The movie is good.`` It prevents the model of misunderstand the context. If we don't have overlap, the second sentence will be ``The movie is good.`` and the embedding model might only consider this document is merely ``Positive``. @@ -119,7 +119,7 @@ Now you can use the splitter to create document chunks. ---------------- Now we have splitted long documents to shorter ones, the next part is to retrieve the relevant documents. -But how can we find "relevant" texts? A commonly applied approach in the NLP field is Embedding. +But how can we find "relevant" texts? A commonly applied approach in the NLP field is Embedding. For ``Embedder`` tutorial, please check `Embedder <./embedder.html>`_. @@ -127,9 +127,9 @@ For ``Embedder`` tutorial, please check `Embedder <./embedder.html>`_. ------------------------ Given a query, the retriever is responsible to fetch the relevant documents. Now we have document splitter and embedder, we can check the retrievers now. -LightRAG provides ``FAISSRetriever``, ``InMemoryBM25Retriever``, and ``LLMRetriever``. -These retrievers are built on the basic :class:`Retriever`, with default index building and retrieve phases. -All these retrievers return a list of ``RetrieverOutput``, including indexes, scores, query and documents. +LightRAG provides ``FAISSRetriever``, ``BM25Retriever``, and ``LLMRetriever``. +These retrievers are built on the basic :class:`Retriever`, with default index building and retrieve phases. +All these retrievers return a list of ``RetrieverOutput``, including indexes, scores, query and documents. #. FAISSRetriever @@ -164,7 +164,7 @@ Here is an example: import os os.environ["KMP_DUPLICATE_LIB_OK"] = "True" - # To use ``FAISSRetriever``, we need to prepare the embeddings + # To use ``FAISSRetriever``, we need to prepare the embeddings # for documents or chunks following the previous steps. # configure the splitter setting @@ -184,7 +184,7 @@ Here is an example: doc1 = Document( meta_data={"title": "Luna's Profile"}, text="lots of more nonsense text." * 50 - + "Luna is a domestic shorthair." + + "Luna is a domestic shorthair." + "lots of nonsense text." * 100 + "Luna loves to eat Tuna." + "lots of nonsense text." * 50, @@ -195,7 +195,7 @@ Here is an example: text="lots of more nonsense text." * 50 + "Luna loves to eat lickable treats." + "lots of more nonsense text." * 50 - + "Luna loves to play cat wand." + + "Luna loves to play cat wand." + "lots of more nonsense text." 
* 50 + "Luna likes to sleep all the afternoon", id="doc2", @@ -238,7 +238,7 @@ Here is an example: ) # build indexes for the documents - faiss_retriever.build_index_from_documents(embeddings) + faiss_retriever.build_index_from_documents(embeddings) # set up queries queries = ["what does luna like to eat?"] @@ -256,7 +256,7 @@ Here is an example: for idx in result.doc_indexes: print(f"Document ID: {splitted_docs[idx].id} - Title: {splitted_docs[idx].meta_data['title']}") print(f"Text: {splitted_docs[idx].text}") # Print first 200 characters of the document text - + print(f"*" * 50) # ************************************************** @@ -264,14 +264,14 @@ Here is an example: # Query: what does luna like to eat? # Document Indexes: [8 2], Scores: [0.741 0.724] # Document ID: e3f04c8b-68ae-4dde-844a-439037e58842 - Title: Luna's Hobbies - # Text: text. Luna loves to eat lickable treats.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more + # Text: text. 
Luna loves to eat lickable treats.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more # Document ID: f2d0f52a-4e69-4cc5-8f78-4499fa22525d - Title: Luna's Profile - # Text: text.Luna is a domestic shorthair.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots + # Text: text.Luna is a domestic shorthair.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense 
text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots # ************************************************** -#. InMemoryBM25Retriever +#. BM25Retriever -The ``InMemoryBM25Retriever`` leverages the `Okapi BM25 algorithm(Best Matching 25 ranking) `_, a widely-used ranking function in information retrieval that is particularly effective in contexts where document relevance to a query is crucial. +The ``BM25Retriever`` leverages the `Okapi BM25 algorithm(Best Matching 25 ranking) `_, a widely-used ranking function in information retrieval that is particularly effective in contexts where document relevance to a query is crucial. This retriever is initialized with parameters that fine-tune its behavior: @@ -281,7 +281,7 @@ This retriever is initialized with parameters that fine-tune its behavior: ``alpha``: Sets a cutoff for the IDF scores, filtering out terms that are too common to be informative. IDF refers to `Inverse document frequency `_. It measures how much information the word provides. Lower the IDF score means the word is used a lot and less important in the document. -Please check :class:`InMemoryBM25Retriever` to see how we calculate the IDF score. +Please check :class:`BM25Retriever` to see how we calculate the IDF score. ``split_function``: Tokenization is customizable via the ``split_function``, which defaults to splitting text by tokens. Here's an example using a custom tokenizer: The following example shows how the token splitting works. This tokenizer converts text into a series of token IDs, which are numeric representations of the tokens. @@ -300,31 +300,31 @@ The following example shows how the token splitting works. This tokenizer conver Tokenization can be customized through ``split_function``. -Similar to ``FAISSRetriever``, developers can build index from documents. In ``InMemoryBM25Retriever`` allows direct documents inputs without need for preparing embeddings beforehand. +Similar to ``FAISSRetriever``, developers can build index from documents. In ``BM25Retriever`` allows direct documents inputs without need for preparing embeddings beforehand. The ``build_index_from_documents`` first tokenizes the documents, then analyzes each to compute token frequencies necessary for IDF calculation. And we filter the IDF based on the specified ``alpha``. -The ``t2d`` represents the token and its frequency in documents. +The ``t2d`` represents the token and its frequency in documents. For example, t2d={"apple":{0:1}} means, the word apple appears once in the 0th document. With the frequency we can calculate idf. The ``idf`` dictionary is to record the idf score for each token, such as {"apple": 0.9}, it means in the corpus, the token apple has idf score=0.9. ``load_index``, ``save_index`` and ``reset_index`` are supported. 
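To make the ``t2d`` and ``idf`` bookkeeping concrete, here is a small self-contained sketch of the indexing step described above. It is only an illustration of the idea: the actual retriever tokenizes with the configured ``split_function``, uses the Okapi BM25 IDF with its own smoothing, and filters tokens by ``alpha``.

.. code-block:: python

    import math
    from collections import Counter

    corpus = ["apple apple banana", "apple orange"]
    tokenized = [doc.split() for doc in corpus]  # the retriever would use its split_function

    # t2d: token -> {document index: frequency of the token in that document}
    t2d = {}
    for doc_idx, tokens in enumerate(tokenized):
        for token, freq in Counter(tokens).items():
            t2d.setdefault(token, {})[doc_idx] = freq

    # idf: token -> inverse document frequency over the corpus
    n_docs = len(tokenized)
    idf = {
        token: math.log((n_docs - len(doc_freqs) + 0.5) / (len(doc_freqs) + 0.5) + 1)
        for token, doc_freqs in t2d.items()
    }

    print(t2d["apple"])  # {0: 2, 1: 1}
    print(idf)           # "banana" and "orange" score higher than the ubiquitous "apple"

Rare tokens receive high IDF scores, while tokens that appear in most documents contribute little to the ranking, which is exactly what the ``alpha`` cutoff exploits.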
-When a query is received, each token of the query is first transformed into its corresponding token using the same ``split_function`` configured during initialization. +When a query is received, each token of the query is first transformed into its corresponding token using the same ``split_function`` configured during initialization. If a token from the query also appears in the documents of the corpus, -the retriever iterates over the documents containing the token, -applying the BM25 formula to calculate and accumulate scores based on the token's frequency. -For instance, document 1 = "apple, apple, banana", document 2 = "apple, orange". +the retriever iterates over the documents containing the token, +applying the BM25 formula to calculate and accumulate scores based on the token's frequency. +For instance, document 1 = "apple, apple, banana", document 2 = "apple, orange". If the query is "apple, orange", the score of document 1 be the accumulated score from 2 "apple". The score of document 2 will be the accumulated score from "apple" and "orange". -The document's score increases for each occurrence of these tokens. -This cumulative scoring approach ensures that documents containing more query-related tokens are ranked higher. -Finally, the ``k`` documents with the highest cumulative scores are identified and returned in a ``RetrieverOutput``, +The document's score increases for each occurrence of these tokens. +This cumulative scoring approach ensures that documents containing more query-related tokens are ranked higher. +Finally, the ``k`` documents with the highest cumulative scores are identified and returned in a ``RetrieverOutput``, which means most relevant to the query. #. LLMRetriever -Unlike ``FAISSRetriever`` and ``InMemoryBM25Retriever``, the ``LLMRetriever`` utilizes LLM models to perform retrieval. +Unlike ``FAISSRetriever`` and ``BM25Retriever``, the ``LLMRetriever`` utilizes LLM models to perform retrieval. This model-driven approach does not rely on traditional similarity/IDF scores but instead uses the model's understanding of the content. @@ -359,7 +359,7 @@ Here is an example for ``LLMRetriever``: documents = [ Document(id="doc1", meta_data={"title": "Luna's Profile"}, text= "lots of more nonsense text." * 50 - + "Luna is a domestic shorthair." + + "Luna is a domestic shorthair." + "lots of nonsense text." * 50 + "Luna loves to eat Tuna." + "lots of nonsense text." * 50), @@ -367,7 +367,7 @@ Here is an example for ``LLMRetriever``: "lots of more nonsense text." * 50 + "Luna loves to eat lickable treats." + "lots of more nonsense text." * 50 - + "Luna loves to play cat wand." + + "Luna loves to play cat wand." + "lots of more nonsense text." * 50 + "Luna likes to sleep all the afternoon"), ] @@ -420,9 +420,9 @@ Here is an example for ``LLMRetriever``: # ************************************************** # Query: what does luna like to eat? 
# Document ID: 557cc52b-a2b7-4780-bbc3-f1be8330c167 - Title: Luna's Profile - # Text: text.Luna is a domestic shorthair.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.Luna loves to eat Tuna.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense + # Text: text.Luna is a domestic shorthair.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.Luna loves to eat Tuna.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense # ************************************************** # Query: what does Luna look like? 
# Document ID: 7de4b00a-e539-4df0-adc9-b4c312bed365 - Title: Luna's Profile - # Text: text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.Luna is a domestic shorthair.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense - # ************************************************** \ No newline at end of file + # Text: text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.lots of more nonsense text.Luna is a domestic shorthair.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense text.lots of nonsense + # ************************************************** diff --git a/docs/source/developer_notes/text_splitter.rst b/docs/source/developer_notes/text_splitter.rst index ff7afc9d..b6904110 100644 --- a/docs/source/developer_notes/text_splitter.rst +++ b/docs/source/developer_notes/text_splitter.rst @@ -7,13 +7,13 @@ Text Splitter In this tutorial, we will learn: -#. Why do we need the ``TextSplitter`` +#. TextSplitter Overview -#. How does ``LightRAG's TextSplitter`` work +#. How does it work -#. How to implement ``LightRAG's TextSplitter`` +#. 
How to use it -Why do we need the ``TextSplitter`` +TextSplitter Overview ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ LLMs’s context window is limited and the performance often drops with very long and nonsense input. Shorter content is more manageable and fits memory constraint. @@ -22,195 +22,97 @@ The goal of the text splitter is to chunk large data into smaller ones, potentia The ``TextSplitter`` is designed to efficiently process and chunk **plain text**. It leverages configurable separators to facilitate the splitting of :obj:`document object ` into smaller manageable document chunks. -How does ``LightRAG's TextSplitter`` work +How does it work ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -``TextSplitter`` supports 2 types of splitting. - -* Type 1: Specify the exact text splitting point such as space<" "> and periods<".">. It is intuitive: -"Hello, world!" -> ["Hello, " ,"world!"] - -* Type 2: Use :class:`tokenizer `. It works as: -"Hello, world!" -> ['Hello', ',', ' world', '!'] -This aligns with how models see text in the form of tokens. (`Reference `_) - -Simple text splitting can underestimate the number of tokens. Tokenizer reflects the real token numbers the models take in. -But the Tokenizer here only works on world level. - -* **Overview**: ``TextSplitter`` first utilizes ``split_by`` to specify the text-splitting criterion and breaks the long text into smaller texts. Then we create a sliding window with length= ``chunk_size``. It moves at step= ``chunk_size`` - ``chunk_overlap``. -The texts inside each window will get concatenated to a smaller chunk. The generated chunks from the splitted text will be returned. +The texts inside each window will get merged to a smaller chunk. The generated chunks from the splitted text will be returned. -Here are some Definitions: +**Splitting Types** -* **Definitions** +``TextSplitter`` supports 2 types of splitting. -``split_by``: Specifies the text-splitting criterion using predefined keys like "word", "sentence", "page", "passage", and "token". The splitter utilizes the corresponding separator from the ``SEPARATORS`` dictionary. - -``SEPARATORS``: Maps ``split_by`` criterions to their exact text separators, e.g., spaces<" "> for "word" or periods<"."> for "sentence". +* **Type 1:** Specify the exact text splitting point such as space<" "> and periods<".">. It is intuitive, for example, split_by "word": -Usage: **SEPARATORS[``split_by``]=separator** +:: -.. note:: - For option ``token``, its separator is "" because we directly split by a tokenizer, instead of text point. - -* ``split_by`` specifies the separator by which the document should be split, i.e. the smallest unit during splitting. -For Type 1 splitting, we apply ``Python str.split()`` to break the text. -Check the following table for ``split_by`` options: - -.. list-table:: Text Splitting Options - :widths: 10 15 75 - :header-rows: 1 + "Hello, world!" -> ["Hello, " ,"world!"] - * - ``split_by`` Option - - Actual Separator - - Example - * - **page** - - ``\f`` - - ``Hello, world!\fNew page starts here.`` to ``['Hello, world!\x0c', 'New page starts here.']`` - * - **passage** - - ``\n\n`` - - ``Hello, world!\n\nNew paragraph starts here`` to ``['Hello, world!\n\n', 'New paragraph starts here.']`` - * - **sentence** - - ``.`` - - ``Hello, world. This is LightRAG.`` to ``['Hello, world.', ' This is LightRAG.', '']`` - * - **word** - - ```` - - ``Hello, world. This is LightRAG.`` to ``['Hello, ', 'world. ', 'This ', 'is ', 'LightRAG.']`` +* **Type 2:** Use :class:`tokenizer `. 
It works as: -* ``chunk_size`` is the the maximum number of units in each chunk. +:: -* ``chunk_overlap`` is the number of units that each chunk should overlap. Including context at the borders prevents sudden meaning shift in text between sentences/context, especially in sentiment analysis. + "Hello, world!" -> ['Hello', ',', ' world', '!'] -Here is an example of how ``chunk_size`` works with ``chunk_overlap``: +This aligns with how models see text in the form of tokens (`Reference `_), +Tokenizer reflects the real token numbers the models take in and helps the developers control budgets. -.. code-block:: python - from lightrag.core.text_splitter import TextSplitter - from lightrag.core.types import Document +**Definitions** + +* **split_by** specifies the split rule, i.e. the smallest unit during splitting. We support ``"word"``, ``"sentence"``, ``"page"``, ``"passage"``, and ``"token"``. The splitter utilizes the corresponding separator from the ``SEPARATORS`` dictionary. +For Type 1 splitting, we apply ``Python str.split()`` to break the text. - # configure the splitter setting - text_splitter_settings = { - "split_by": "word", - "chunk_size": 5, - "chunk_overlap": 2, - } +* **SEPARATORS**: Maps ``split_by`` criterions to their exact text separators, e.g., spaces <" "> for "word" or periods <"."> for "sentence". - # set up the document splitter - text_splitter = TextSplitter( - split_by=text_splitter_settings["split_by"], - chunk_size=text_splitter_settings["chunk_size"], - chunk_overlap=text_splitter_settings["chunk_overlap"], - ) - doc1 = Document( - text="Hello, this is lightrag. Please implement your splitter here.", - id="doc1", - ) +.. note:: + For option ``token``, its separator is "" because we directly split by a tokenizer, instead of text point. - documents = [doc1] +* **chunk_size** is the the maximum number of units in each chunk. - splitted_docs = (text_splitter.call(documents=documents)) +* **chunk_overlap** is the number of units that each chunk should overlap. Including context at the borders prevents sudden meaning shift in text between sentences/context, especially in sentiment analysis. - for doc in splitted_docs: - print(doc.text) - # Output: - # Hello, this is lightrag. Please - # lightrag. Please implement your splitter - # your splitter here. -In this case, when splitting by ``word`` with ``chunk_size``=5 and ``chunk_overlap``=2, -each chunk will repeat 2 words from the previous chunk. These 2 words are set by ``chunk_overlap``. -This means each chunk has ``5-2=3`` word(split unit) difference compared with its previous. +Here are examples of how ``split_by``, ``chunk_size`` works with ``chunk_overlap``. +Document Text: -.. note:: - ``chunk_overlap`` should always be smaller than ``chunk_size``, otherwise the window won't move and the splitting stucks. - - -One more example on ``split_by=token``: +:: + + Hello, this is lightrag. Please implement your splitter here. -.. code-block:: python - # configure the splitter setting - text_splitter_settings = { - "split_by": "token", - "chunk_size": 5, - "chunk_overlap": 2, - } - - # set up the document splitter - text_splitter = TextSplitter( - ... - ) - doc1 = Document( - text="Hello, this is lightrag. Please implement your splitter here.", - id="doc1", - ) - documents = [doc1] - splitted_docs = (text_splitter.call(documents=documents)) +.. list-table:: Chunking Example Detailed + :widths: 15 15 15 55 + :header-rows: 1 - for doc in splitted_docs: - print(doc.text) - # Output: - # Hello, this is lightrag. 
Please - # lightrag. Please implement your splitter - # your splitter here. -In this case, when splitting by ``word`` with ``chunk_size``=5 and ``chunk_overlap``=2, + * - Split By + - Chunk Size + - Chunk Overlap + - Resulting Chunks + * - word + - 5 + - 2 + - "Hello, this is lightrag. Please", "lightrag. Please implement your splitter", "your splitter here." + * - sentence + - 1 + - 0 + - "Hello, this is lightrag.", "Please implement your splitter here." + * - token + - 5 + - 2 + - "Hello, this is l", "is lightrag.", "trag. Please implement your", "implement your splitter here." + +When splitting by ``word`` with ``chunk_size`` = 5 and ``chunk_overlap`` = 2, each chunk will repeat 2 words from the previous chunk. These 2 words are set by ``chunk_overlap``. This means each chunk has ``5-2=3`` word(split unit) difference compared with its previous. -.. note:: - ``chunk_overlap`` should always be smaller than ``chunk_size``, otherwise the window won't move and the splitting stucks. - - -One more example on ``split_by=token``: - -.. code-block:: python - # configure the splitter setting - text_splitter_settings = { - "split_by": "token", - "chunk_size": 5, - "chunk_overlap": 2, - } - - # set up the document splitter - text_splitter = TextSplitter( - ... - ) - - doc1 = Document( - text="Hello, this is lightrag. Please implement your splitter here.", - id="doc1", - ) - documents = [doc1] - splitted_docs = (text_splitter.call(documents=documents)) - for doc in splitted_docs: - print(doc.text) - # Output: - # Hello, this is l - # is lightrag. - # trag. Please implement your - # implement your splitter here. When splitting using tokenizer, each chunk still keeps 5 tokens. -Since ``lightrag`` -> ['l', 'igh', 'trag'], the second chunk is actually ``is`` + ``l`` + ``igh`` + ``trag`` + ``.``. +For example, the tokenizer transforms ``lightrag`` to ['l', 'igh', 'trag']. So the second chunk is actually ``is`` + ``l`` + ``igh`` + ``trag`` + ``.``. .. note:: - The punctuation is considered as a token. - -This splitting aligns with how models see text in the form of tokens. (`Reference `_) - -Simple text splitting(Type 1) can underestimate the number of tokens. Tokenizer reflects the real token numbers the models take in. -But the Tokenizer here only works at world level. + ``chunk_overlap`` should always be smaller than ``chunk_size``, otherwise the window won't move and the splitting stucks. + When ``split_by`` = ``token``, the punctuation is considered as a token. -How to implement ``LightRAG's TextSplitter`` +How to use it ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ What you need is to specify the arguments and input your documents this way: .. code-block:: python - from lightrag.core.text_splitter import TextSplitter + from lightrag.components.data_process.text_splitter import TextSplitter from lightrag.core.types import Document # Configure the splitter settings text_splitter = TextSplitter( - split_by="sentence", + split_by="word", chunk_size=5, chunk_overlap=1 ) @@ -227,6 +129,11 @@ What you need is to specify the arguments and input your documents this way: for doc in splitted_docs: print(doc) + # Output: + # Document(id=44a8aa37-0d16-40f0-9ca4-2e25ae5336c8, text='Example text. More example text. ', meta_data=None, vector=[], parent_doc_id=doc1, order=0, score=None) + # Document(id=ca0af45b-4f88-49b5-97db-163da9868ea4, text='text. 
Even more text to ', meta_data=None, vector=[], parent_doc_id=doc1, order=1, score=None) + # Document(id=e7b617b2-3927-4248-afce-ec0fc247ac8b, text='to illustrate.', meta_data=None, vector=[], parent_doc_id=doc1, order=2, score=None) + Integration with Other Document Types ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ This functionality is ideal for segmenting texts into sentences, words, pages, or passages, which can then be processed further for NLP applications. diff --git a/docs/source/developer_notes/tool_helper.rst b/docs/source/developer_notes/tool_helper.rst new file mode 100644 index 00000000..07cc1c3b --- /dev/null +++ b/docs/source/developer_notes/tool_helper.rst @@ -0,0 +1,856 @@ +Function calls +=========================== +.. admonition:: Author + :class: highlight + + `Li Yin `_ + +Tools are means LLM can use to interact with the world beyond of its internal knowledge. Technically speaking, retrievers are tools to help LLM to get more relevant context, and memory is a tool for LLM to carry out a conversation. +Deciding when, which, and how to use a tool, and even to creating a tool is an agentic behavior: +Function calls is a process of showing LLM a list of funciton definitions and prompt it to choose one or few of them. +Many places use tools and function calls interchangably. + +In this note we will covert function calls, including + +1. Function call walkthrough +2. Overall design +3. Function call in action + + +Quick Walkthrough +-------------------- +Users might already know of OpenAI's function call feature via its API (https://platform.openai.com/docs/guides/function-calling). + +.. code-block:: python + + def get_current_weather(location, unit="fahrenheit"): + """Get the current weather in a given location""" + import json + + if "tokyo" in location.lower(): + return json.dumps({"location": "Tokyo", "temperature": "10", "unit": unit}) + elif "san francisco" in location.lower(): + return json.dumps( + {"location": "San Francisco", "temperature": "72", "unit": unit} + ) + elif "paris" in location.lower(): + return json.dumps({"location": "Paris", "temperature": "22", "unit": unit}) + else: + return json.dumps({"location": location, "temperature": "unknown"}) + +For the above function, it is shown to LLM in the following format: + +.. code-block:: python + + function_definition = { + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state, e.g. San Francisco, CA", + }, + "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}, + }, + "required": ["location"], + }, + }, + } + + +Then the API will respond with a list of function names and parameters: + +.. code-block:: python + + Function(arguments='{"location": "San Francisco, CA"}', name='get_current_weather') + +Then the output will need to be parsed into arguments which are then passed to the function: + +.. code-block:: python + + function_name = tool_call.function.name + function_to_call = available_functions[function_name] + function_args = json.loads(tool_call.function.arguments) + function_response = function_to_call( + location=function_args.get("location"), + unit=function_args.get("unit"), + ) + +LightRAG Scope and Design +---------------------------- +Even with API, users have to (1) create the function definition, (2) Parse the response, (3) Execute the function. 
+What is missing in using API is: (1) How the function definitions are shown to LLM, (2) How the output format is instructured. + +LightRAG will provide built-in capabilities to do function calls simplily via prompt without relying on the tools API. + +**Design Goals** + +Asking LLM to call a function with keyword arguments is the simplest way of achieving the function call. +But it is limiting: + +1. What if the argument value is a more complicated data structure? +2. What if you want to use a variable as an argument? + +LightRAG will also provide ``FunctionExpression`` where calling a function is asking LLM to write the code snippet of the function call directly: + +.. code-block:: python + + 'get_current_weather("San Francisco, CA", unit="celsius")' + +This is not only more flexible, but also it is also a more efficient/compact way to call a function. + +.. As a library, we prioritize the built-in function call capabilities via the normal prompt-response. +.. Function calls are often just a prerequisite for more complext agent behaviors. +.. This means we need to know how to form a ``prompt``, how to define ``functions`` or ``tools``, how to parse them out from the response, and how to execute them securely in your LLM applications. +.. We encourage our users to handle function calls on their own and we make the effort to make it easy to do so. + +.. 1. Get **maximum control and transparency** over your prompt and for researchers to help improve these capabilities. +.. 2. Model-agnositc: Can switch to any model, either local or API based, without changing the code. +.. 3. More powerful. + + + +**Data Models** + +We have four ``DataClass`` models: :class:`core.types.FunctionDefinition`, :class:`core.types.Function`, :class:`core.types.FunctionExpression`, and :class:`core.types.FunctionOutput` to handle function calls. + +These classes not only help with data structuring but also by being a subclass of ``DataClass``, it can be easily used in the prompt. +``Function`` has three important attributes: ``name``, ``args``, and ``kwargs`` for the function name, positional arguments and keyword arguments. +``FunctionExpression`` only has one action for the function call expression. +Both can be used to format the output in the prompt. We will demonstrate how to use it later. + +**Components** + +We have two components: :class:`core.func_tool.FunctionTool` and :class:`core.tool_manager.ToolManager` to streamline the lifecyle of (1) +creating the function definition (2) formatting the prompt with the definitions and output format (3) parsing the response (4) executing the function. + +``FunctionTool`` is a container of a single function. It handles the function definition and executing of the function. It supports both sync and async functions. +``ToolManager`` manages all tools. And it handles the execution and context_map that is used to parse the functions sercurely. + +``ToolManager`` is simplified way to do function calls. + +.. list-table:: + :header-rows: 1 + + * - + - Attribute/Method + - Description + * - Attributes + - ``tools`` + - A list of tools managed by ToolManager. Each tool is an instance or a derivative of ``FunctionTool``. + * - + - ``context`` + - A dictionary combining tool definitions and additional context, used for executing function expressions. + * - Methods + - ``__init__`` + - Initializes a new ToolManager instance with tools and additional context. Tool can be ``FunctionTool`` or any function. 
+ * - + - ``yaml_definitions`` + - Returns the YAML definitions of all tools managed by ToolManager. + * - + - ``json_definitions`` + - Returns the JSON definitions of all tools managed by ToolManager. + * - + - ``function_definitions`` + - Returns a list of function definitions for all tools. + * - + - ``parse_func_expr`` + - Parses a ``FunctionExpression`` and returns a ``Function`` object ready for execution. + * - + - ``execute_func`` + - Executes a given ``Function`` object and returns its output wrapped in ``FunctionOutput``. Support both sync and async functions. + * - + - ``execute_func_expr`` + - Parses and executes a ``FunctionExpression`` directly, returning the execution result as ``FunctionOutput``. Support both sync and async functions. + * - + - ``execute_func_expr_via_sandbox`` + - Execute the function expression via sandbox. Only support sync functions. + * - + - ``execute_func_expr_via_eval`` + - Execute the function expression via eval. Only support sync functions. + +Function Call in Action +-------------------------- + +We will use the following functions as examples across this note: + +.. code-block:: python + + from dataclasses import dataclass + import numpy as np + import time + import asyncio + + + def multiply(a: int, b: int) -> int: + """Multiply two numbers.""" + time.sleep(1) + return a * b + + + def add(a: int, b: int) -> int: + """Add two numbers.""" + time.sleep(1) + return a + b + + + async def divide(a: float, b: float) -> float: + """Divide two numbers.""" + await asyncio.sleep(1) + return float(a) / b + + + async def search(query: str) -> List[str]: + """Search for query and return a list of results.""" + await asyncio.sleep(1) + return ["result1" + query, "result2" + query] + + + def numpy_sum(arr: np.ndarray) -> float: + """Sum the elements of an array.""" + return np.sum(arr) + + + x = 2 + + @dataclass + class Point: + x: int + y: int + + + def add_points(p1: Point, p2: Point) -> Point: + return Point(p1.x + p2.x, p1.y + p2.y) + +We delibrately cover both async and sync, examples of using variables and more complicated data structures as arguments. +We will demonstrate the structure and how to use each data model and component to call the above functions in different ways. + +1. FunctionTool +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +First, let's see how we help describe the function to LLM. + +Use the above functions as examples, ``FunctionTool`` will generate the ``FunctionDefinition`` for each function automatically if the user did not pass it in. + +.. code-block:: python + + from lightrag.core.func_tool import FunctionTool + + functions =[multiply, add, divide, search, numpy_sum, add_points] + tools = [ + FunctionTool(fn=fn) for fn in functions + ] + for tool in tools: + print(tool) + +The printout shows three attributes for each function: ``fn``, ``_is_async``, and ``definition``. + +.. 
code-block:: + + FunctionTool(fn: , async: False, definition: FunctionDefinition(func_name='multiply', func_desc='multiply(a: int, b: int) -> int\nMultiply two numbers.', func_parameters={'type': 'object', 'properties': {'a': {'type': 'int'}, 'b': {'type': 'int'}}, 'required': ['a', 'b']})) + FunctionTool(fn: , async: False, definition: FunctionDefinition(func_name='add', func_desc='add(a: int, b: int) -> int\nAdd two numbers.', func_parameters={'type': 'object', 'properties': {'a': {'type': 'int'}, 'b': {'type': 'int'}}, 'required': ['a', 'b']})) + FunctionTool(fn: , async: True, definition: FunctionDefinition(func_name='divide', func_desc='divide(a: float, b: float) -> float\nDivide two numbers.', func_parameters={'type': 'object', 'properties': {'a': {'type': 'float'}, 'b': {'type': 'float'}}, 'required': ['a', 'b']})) + FunctionTool(fn: , async: True, definition: FunctionDefinition(func_name='search', func_desc='search(query: str) -> List[str]\nSearch for query and return a list of results.', func_parameters={'type': 'object', 'properties': {'query': {'type': 'str'}}, 'required': ['query']})) + FunctionTool(fn: , async: False, definition: FunctionDefinition(func_name='numpy_sum', func_desc='numpy_sum(arr: numpy.ndarray) -> float\nSum the elements of an array.', func_parameters={'type': 'object', 'properties': {'arr': {'type': 'ndarray'}}, 'required': ['arr']})) + FunctionTool(fn: , async: False, definition: FunctionDefinition(func_name='add_points', func_desc='add_points(p1: __main__.Point, p2: __main__.Point) -> __main__.Point\nNone', func_parameters={'type': 'object', 'properties': {'p1': {'type': 'Point', 'properties': {'x': {'type': 'int'}, 'y': {'type': 'int'}}, 'required': ['x', 'y']}, 'p2': {'type': 'Point', 'properties': {'x': {'type': 'int'}, 'y': {'type': 'int'}}, 'required': ['x', 'y']}}, 'required': ['p1', 'p2']})) + +View the definition for ``add_point`` and also the ``get_current_weather`` function in dict format: + +.. code-block:: python + + print(tools[-2].definition.to_dict()) + +The output will be: + +.. code-block:: + + { + "func_name": "numpy_sum", + "func_desc": "numpy_sum(arr: numpy.ndarray) -> float\nSum the elements of an array.", + "func_parameters": { + "type": "object", + "properties": {"arr": {"type": "ndarray"}}, + "required": ["arr"], + }, + } + +Using ``to_json`` and ``to_yaml`` will directly get us the string that can be fed into the prompt. +And we prefer to use ``yaml`` format here as it is more token efficient: + + +We choose to describe the function not only with the docstring which is `Sum the elements of an array.` but also with the function signature which is `numpy_sum(arr: numpy.ndarray) -> float`. +This will give the LLM a view of the function at the code level and it helps with the function call. + +.. note:: + Users should better use type hints and a good docstring to help LLM understand the function better. + +In comparison, here is our definition for ``get_current_weather``: + +.. code-block:: + + { + "func_name": "get_current_weather", + "func_desc": "get_current_weather(location, unit='fahrenheit')\nGet the current weather in a given location", + "func_parameters": { + "type": "object", + "properties": { + "location": {"type": "Any"}, + "unit": {"type": "Any", "default": "fahrenheit"}, + }, + "required": ["location"], + }, + } + +To execute function using function names requres us to manage a function map. Instead of using the raw function, we use ``FunctionTool`` instead for this context map. + +.. 
code-block:: python + + context_map = {tool.definition.func_name: tool for tool in tools} + +To execute a function, we can do: + +.. code-block:: python + + function_name = "add" + function_to_call = context_map[function_name] + function_args = {"a": 1, "b": 2} + function_response = function_to_call.call(**function_args) + +If we use async function, we can use ``acall``. +``execute`` is a wrapper that you can call a function in both sync and async way regardless of the function type. +Check out the API documentation for more details. + +2. ToolManager +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Using ``ToolManager`` on all the above function: + +.. code-block:: python + + from lightrag.core.tool_manager import ToolManager + + tool_manager = ToolManager(tools=functions) + print(tool_manager) + +The tool manager can take both ``FunctionTool``, function and async function. +The printout: + +.. code-block:: + + ToolManager(Tools: [FunctionTool(fn: , async: False, definition: FunctionDefinition(func_name='multiply', func_desc='multiply(a: int, b: int) -> int\nMultiply two numbers.', func_parameters={'type': 'object', 'properties': {'a': {'type': 'int'}, 'b': {'type': 'int'}}, 'required': ['a', 'b']})), FunctionTool(fn: , async: False, definition: FunctionDefinition(func_name='add', func_desc='add(a: int, b: int) -> int\nAdd two numbers.', func_parameters={'type': 'object', 'properties': {'a': {'type': 'int'}, 'b': {'type': 'int'}}, 'required': ['a', 'b']})), FunctionTool(fn: , async: True, definition: FunctionDefinition(func_name='divide', func_desc='divide(a: float, b: float) -> float\nDivide two numbers.', func_parameters={'type': 'object', 'properties': {'a': {'type': 'float'}, 'b': {'type': 'float'}}, 'required': ['a', 'b']})), FunctionTool(fn: , async: True, definition: FunctionDefinition(func_name='search', func_desc='search(query: str) -> List[str]\nSearch for query and return a list of results.', func_parameters={'type': 'object', 'properties': {'query': {'type': 'str'}}, 'required': ['query']})), FunctionTool(fn: , async: False, definition: FunctionDefinition(func_name='numpy_sum', func_desc='numpy_sum(arr: numpy.ndarray) -> float\nSum the elements of an array.', func_parameters={'type': 'object', 'properties': {'arr': {'type': 'ndarray'}}, 'required': ['arr']})), FunctionTool(fn: , async: False, definition: FunctionDefinition(func_name='add_points', func_desc='add_points(p1: __main__.Point, p2: __main__.Point) -> __main__.Point\nNone', func_parameters={'type': 'object', 'properties': {'p1': {'type': 'Point', 'properties': {'x': {'type': 'int'}, 'y': {'type': 'int'}}, 'required': ['x', 'y']}, 'p2': {'type': 'Point', 'properties': {'x': {'type': 'int'}, 'y': {'type': 'int'}}, 'required': ['x', 'y']}}, 'required': ['p1', 'p2']}))], Additional Context: {}) + + + +We will show more how it can be used in the next section. + +3. Function Call end-to-end +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Now, let us add prompt and start to do function calls via LLMs. +We use the following prompt to do a single function call. + +.. code-block:: python + + template = r"""You have these tools available: + {% if tools %} + + {% for tool in tools %} + {{ loop.index }}. + {{tool}} + ------------------------ + {% endfor %} + + {% endif %} + + {{output_format_str}} + + + User: {{input_str}} + You: + """ + +**Pass tools in the prompt** + +We use `yaml` format here and show an example with less tools. + +.. 
code-block:: python + + from lightrag.core.prompt_builder import Prompt + + prompt = Prompt(template=template) + small_tool_manager = ToolManager(tools=tools[:2]) + + renered_prompt = prompt(tools=small_tool_manager.yaml_definitions) + print(renered_prompt) + +The output is: + +.. code-block:: + + You have these tools available: + + 1. + func_name: multiply + func_desc: 'multiply(a: int, b: int) -> int + + Multiply two numbers.' + func_parameters: + type: object + properties: + a: + type: int + b: + type: int + required: + - a + - b + + ------------------------ + 2. + func_name: add + func_desc: 'add(a: int, b: int) -> int + + Add two numbers.' + func_parameters: + type: object + properties: + a: + type: int + b: + type: int + required: + - a + - b + + ------------------------ + + + None + + + User: None + You: + +**Pass the output format** + +We have two ways to instruct LLM to call the function: + +1. Using the function name and arguments, we will leverage ``Function`` as LLM's output data type. + +.. code-block:: python + + from lightrag.core.types import Function + + output_data_class = Function + output_format_str = output_data_class.to_json_signature(exclude=["thought", "args"]) + + renered_prompt= prompt(output_format_str=output_format_str) + print(renered_prompt) + +We execluded both the ``thought`` and ``args`` as it is easier to use ``kwargs`` to represent the arguments. +The output is: + +.. code-block:: + + You have these tools available: + + { + "name": "The name of the function (str) (optional)", + "kwargs": "The keyword arguments of the function (Optional) (optional)" + } + + + User: None + You: + + + +2. Using the function call expression for which we will use ``FunctionExpression``. + +.. code-block:: python + + from lightrag.core.types import FunctionExpression + + output_data_class = FunctionExpression + output_format_str = output_data_class.to_json_signature(exclude=["thought"]) + print(prompt(output_format_str=output_format_str)) + +The output is: + +.. code-block:: + + You have these tools available: + + { + "action": "FuncName() Valid function call expression. Example: \"FuncName(a=1, b=2)\" Follow the data type specified in the function parameters. e.g. for Type object with x,y properties, use \"ObjectType(x=1, y=2) (str) (required)" + } + + + User: None + You: + +We will use :class:`components.output_parsers.outputs.JsonOutputParser` to streamline the formatting of our output data type. + +.. code-block:: python + + from lightrag.components.output_parsers import JsonOutputParser + + func_parser = JsonOutputParser(data_class=Function) + instructions = func_parser.format_instructions(exclude=["thought", "args"]) + print(instructions) + +The output is: + +.. code-block:: + + Your output should be formatted as a standard JSON instance with the following schema: + ``` + { + "name": "The name of the function (str) (optional)", + "kwargs": "The keyword arguments of the function (Optional) (optional)" + } + ``` + -Make sure to always enclose the JSON output in triple backticks (```). Please do not add anything other than valid JSON output! + -Use double quotes for the keys and string values. + -Follow the JSON formatting conventions. + + +Function Output Format +************************************************** +Now, let's prepare our generator with the above prompt, ``Function`` data class, and ``JsonOutputParser``. + +.. 
code-block:: python + + from lightrag.core.generator import Generator + from lightrag.core.types import ModelClientType + + model_kwargs = {"model": "gpt-3.5-turbo"} + prompt_kwargs = { + "tools": tool_manager.yaml_definitions, + "output_format_str": func_parser.format_instructions( + exclude=["thought", "args"] + ), + } + generator = Generator( + model_client=ModelClientType.OPENAI(), + model_kwargs=model_kwargs, + template=template, + prompt_kwargs=prompt_kwargs, + output_processors=func_parser, + ) + +**Run Queries** + +We will use ``Function.from_dict`` to get the final output type from the json object. Here we use :meth:`core.tool_manager.ToolManager.execute_func` to execute it directly. + +.. code-block:: python + + queries = [ + "add 2 and 3", + "search for something", + "add points (1, 2) and (3, 4)", + "sum numpy array with arr = np.array([[1, 2], [3, 4]])", + "multiply 2 with local variable x", + "divide 2 by 3", + "Add 5 to variable y", + ] + + for idx, query in enumerate(queries): + prompt_kwargs = {"input_str": query} + print(f"\n{idx} Query: {query}") + print(f"{'-'*50}") + try: + result = generator(prompt_kwargs=prompt_kwargs) + # print(f"LLM raw output: {result.raw_response}") + func = Function.from_dict(result.data) + print(f"Function: {func}") + func_output = tool_manager.execute_func(func) + print(f"Function output: {func_output}") + except Exception as e: + print( + f"Failed to execute the function for query: {query}, func: {result.data}, error: {e}" + ) + +From the output shown below, we get valide ``Function`` parsed as output for all queries. +However, we see it failed three function execution: +(1)function `add_points` due to its argument type is a data class, and `multiply` and the last `add` due to it is difficult to represent the local variable `x` and `y` in the function call. + +.. 
code-block:: + + 0 Query: add 2 and 3 + -------------------------------------------------- + Function: Function(thought=None, name='add', args=[], kwargs={'a': 2, 'b': 3}) + Function output: FunctionOutput(name='add', input=Function(thought=None, name='add', args=(), kwargs={'a': 2, 'b': 3}), parsed_input=None, output=5, error=None) + + 1 Query: search for something + -------------------------------------------------- + Function: Function(thought=None, name='search', args=[], kwargs={'query': 'something'}) + Function output: FunctionOutput(name='search', input=Function(thought=None, name='search', args=(), kwargs={'query': 'something'}), parsed_input=None, output=['result1something', 'result2something'], error=None) + + 2 Query: add points (1, 2) and (3, 4) + -------------------------------------------------- + Function: Function(thought=None, name='add_points', args=[], kwargs={'p1': {'x': 1, 'y': 2}, 'p2': {'x': 3, 'y': 4}}) + Error at calling : 'dict' object has no attribute 'x' + Function output: FunctionOutput(name='add_points', input=Function(thought=None, name='add_points', args=(), kwargs={'p1': {'x': 1, 'y': 2}, 'p2': {'x': 3, 'y': 4}}), parsed_input=None, output=None, error="'dict' object has no attribute 'x'") + + 3 Query: sum numpy array with arr = np.array([[1, 2], [3, 4]]) + -------------------------------------------------- + Function: Function(thought=None, name='numpy_sum', args=[], kwargs={'arr': [[1, 2], [3, 4]]}) + Function output: FunctionOutput(name='numpy_sum', input=Function(thought=None, name='numpy_sum', args=(), kwargs={'arr': [[1, 2], [3, 4]]}), parsed_input=None, output=10, error=None) + + 4 Query: multiply 2 with local variable x + -------------------------------------------------- + Function: Function(thought=None, name='multiply', args=[], kwargs={'a': 2, 'b': 'x'}) + Function output: FunctionOutput(name='multiply', input=Function(thought=None, name='multiply', args=(), kwargs={'a': 2, 'b': 'x'}), parsed_input=None, output='xx', error=None) + + 5 Query: divide 2 by 3 + -------------------------------------------------- + Function: Function(thought=None, name='divide', args=[], kwargs={'a': 2.0, 'b': 3.0}) + Function output: FunctionOutput(name='divide', input=Function(thought=None, name='divide', args=(), kwargs={'a': 2.0, 'b': 3.0}), parsed_input=None, output=0.6666666666666666, error=None) + + 6 Query: Add 5 to variable y + -------------------------------------------------- + Function: Function(thought=None, name='add', args=[], kwargs={'a': 5, 'b': 'y'}) + Error at calling : unsupported operand type(s) for +: 'int' and 'str' + Function output: FunctionOutput(name='add', input=Function(thought=None, name='add', args=(), kwargs={'a': 5, 'b': 'y'}), parsed_input=None, output=None, error="unsupported operand type(s) for +: 'int' and 'str'") + + +.. note:: + If users prefer to use Function, to incress the success rate, make sure your function arguments are dict based for class object. You can always convert it to a class from a dict. + + +FunctionExpression Output Format +************************************************** +We will adapt the above code easily using tool manager to use ``FunctionExpression`` as the output format. +We will use FunctionExpression this time in the parser. And we added the necessary context to handle the local variable `x`, `y`, and `np.array`. + +.. 
code-block:: python + + tool_manager = ToolManager( + tools=functions, + additional_context={"x": x, "y": 0, "np.array": np.array, "np": np}, + ) + func_parser = JsonOutputParser(data_class=FunctionExpression) + +Additionally, we can also pass the ``additional_context`` to LLM using the follow prompt after the + +.. code-block:: python + + context = r""" + Your function expression also have access to these context: + {{context_str}} + + """ + +This time, let us try to execute all function concurrently and treating them all as async functions. + +.. code-block:: python + + async def run_async_function_call(self, generator, tool_manager): + answers = [] + start_time = time.time() + tasks = [] + for idx, query in enumerate(queries): + tasks.append(self.process_query(idx, query, generator, tool_manager)) + + results = await asyncio.gather(*tasks) + answers.extend(results) + end_time = time.time() + print(f"Total time taken: {end_time - start_time :.2f} seconds") + return answers + + async def process_query(self, idx, query, generator, tool_manager: ToolManager): + print(f"\n{idx} Query: {query}") + print(f"{'-'*50}") + try: + result = generator(prompt_kwargs={"input_str": query}) + func_expr = FunctionExpression.from_dict(result.data) + print(f"Function_expr: {func_expr}") + func = tool_manager.parse_func_expr(func_expr) + func_output = await tool_manager.execute_func(func) + print(f"Function output: {func_output}") + return func_output + except Exception as e: + print( + f"Failed to execute the function for query: {query}, func: {result.data}, error: {e}" + ) + return None + +In this case, we used :meth:`core.tool_manager.ToolManager.parse_func_expr` and :meth:`core.tool_manager.ToolManager.execute_func` to execute the function. +Or we can directly use :meth:`core.tool_manager.ToolManager.execute_func_expr` to execute the function expression. Both are equivalent. + +From the output shown below, this time we get all function calls executed successfully. + +.. 
code-block:: + + 0 Query: add 2 and 3 + -------------------------------------------------- + Function_expr: FunctionExpression(thought=None, action='add(a=2, b=3)') + + 1 Query: search for something + -------------------------------------------------- + Function_expr: FunctionExpression(thought=None, action='search(query="something")') + + 2 Query: add points (1, 2) and (3, 4) + -------------------------------------------------- + Function_expr: FunctionExpression(thought=None, action='add_points(p1=Point(x=1, y=2), p2=Point(x=3, y=4))') + + 3 Query: sum numpy array with arr = np.array([[1, 2], [3, 4]]) + -------------------------------------------------- + Function_expr: FunctionExpression(thought=None, action='numpy_sum(arr=np.array([[1, 2], [3, 4]]))') + + 4 Query: multiply 2 with local variable x + -------------------------------------------------- + Function_expr: FunctionExpression(thought=None, action='multiply(a=2, b=2)') + + 5 Query: divide 2 by 3 + -------------------------------------------------- + Function_expr: FunctionExpression(thought=None, action='divide(a=2.0, b=3.0)') + + 6 Query: Add 5 to variable y + -------------------------------------------------- + Function_expr: FunctionExpression(thought=None, action='add(a=0, b=5)') + Function output: FunctionOutput(name='add_points', input=Function(thought=None, name='add_points', args=(), kwargs={'p1': Point(x=1, y=2), 'p2': Point(x=3, y=4)}), parsed_input=None, output=Point(x=4, y=6), error=None) + Function output: FunctionOutput(name='numpy_sum', input=Function(thought=None, name='numpy_sum', args=(), kwargs={'arr': array([[1, 2], + [3, 4]])}), parsed_input=None, output=10, error=None) + Function output: FunctionOutput(name='add', input=Function(thought=None, name='add', args=(), kwargs={'a': 2, 'b': 3}), parsed_input=None, output=5, error=None) + Function output: FunctionOutput(name='multiply', input=Function(thought=None, name='multiply', args=(), kwargs={'a': 2, 'b': 2}), parsed_input=None, output=4, error=None) + Function output: FunctionOutput(name='search', input=Function(thought=None, name='search', args=(), kwargs={'query': 'something'}), parsed_input=None, output=['result1something', 'result2something'], error=None) + Function output: FunctionOutput(name='divide', input=Function(thought=None, name='divide', args=(), kwargs={'a': 2.0, 'b': 3.0}), parsed_input=None, output=0.6666666666666666, error=None) + Function output: FunctionOutput(name='add', input=Function(thought=None, name='add', args=(), kwargs={'a': 0, 'b': 5}), parsed_input=None, output=5, error=None) + + +Parallel Function Calls +------------------------- + +We will slightly adapt the output format instruction to get it output json array, which can still be parsed with a json parser. + +.. code-block:: python + + multple_function_call_template = r"""You have these tools available: + {% if tools %} + + {% for tool in tools %} + {{ loop.index }}. + {{tool}} + ------------------------ + {% endfor %} + + {% endif %} + + Here is how you call one function. + {{output_format_str}} + -Always return a List using `[]` of the above JSON objects, even if its just one item. + + + {{input_str}} + You: + """ + +As LLM has problem calling ``add_point``, we will add one example and we will generate it with :meth:`core.types.FunctionExpression.from_function`. +We will update our outputparser to use the example: + +.. 
code-block:: python + + example = FunctionExpression.from_function( + func=add_points, p1=Point(x=1, y=2), p2=Point(x=3, y=4) + ) + func_parser = JsonOutputParser( + data_class=FunctionExpression, examples=[example] + ) + +Here is the updated output format in the prompt: + +.. code-block:: + + + Here is how you call one function. + Your output should be formatted as a standard JSON instance with the following schema: + ``` + { + "action": "FuncName() Valid function call expression. Example: \"FuncName(a=1, b=2)\" Follow the data type specified in the function parameters. e.g. for Type object with x,y properties, use \"ObjectType(x=1, y=2) (str) (required)" + } + ``` + Here is an example: + ``` + { + "action": "add_points(p1=Point(x=1, y=2), p2=Point(x=3, y=4))" + } + ``` + -Make sure to always enclose the JSON output in triple backticks (```). Please do not add anything other than valid JSON output! + -Use double quotes for the keys and string values. + -Follow the JSON formatting conventions. + Awlays return a List using `[]` of the above JSON objects. You can have length of 1 or more. + Do not call multiple functions in one action field. + + +This case, we will show the response from using `execute_func_expr_via_sandbox` to execute the function expression. + +.. code-block:: python + + for idx in range(0, len(queries), 2): + query = " and ".join(queries[idx : idx + 2]) + prompt_kwargs = {"input_str": query} + print(f"\n{idx} Query: {query}") + print(f"{'-'*50}") + try: + result = generator(prompt_kwargs=prompt_kwargs) + # print(f"LLM raw output: {result.raw_response}") + func_expr: List[FunctionExpression] = [ + FunctionExpression.from_dict(item) for item in result.data + ] + print(f"Function_expr: {func_expr}") + for expr in func_expr: + func_output = tool_manager.execute_func_expr_via_sandbox(expr) + print(f"Function output: {func_output}") + except Exception as e: + print( + f"Failed to execute the function for query: {query}, func: {result.data}, error: {e}" + ) + +By using an example to help with calling ``add_point``, we can now successfully execute all function calls. + +.. 
code-block:: python + + 0 Query: add 2 and 3 and search for something + -------------------------------------------------- + Function_expr: [FunctionExpression(thought=None, action='add(a=2, b=3)'), FunctionExpression(thought=None, action='search(query="something")')] + Function output: FunctionOutput(name='add(a=2, b=3)', input=FunctionExpression(thought=None, action='add(a=2, b=3)'), parsed_input=None, output=FunctionOutput(name='add', input=Function(thought=None, name='add', args=(), kwargs={'a': 2, 'b': 3}), parsed_input=None, output=5, error=None), error=None) + Function output: FunctionOutput(name='search(query="something")', input=FunctionExpression(thought=None, action='search(query="something")'), parsed_input=None, output=FunctionOutput(name='search', input=Function(thought=None, name='search', args=(), kwargs={'query': 'something'}), parsed_input=None, output=['result1something', 'result2something'], error=None), error=None) + + 2 Query: add points (1, 2) and (3, 4) and sum numpy array with arr = np.array([[1, 2], [3, 4]]) + -------------------------------------------------- + Function_expr: [FunctionExpression(thought=None, action='add_points(p1=Point(x=1, y=2), p2=Point(x=3, y=4))'), FunctionExpression(thought=None, action='numpy_sum(arr=[[1, 2], [3, 4]])')] + Function output: FunctionOutput(name='add_points(p1=Point(x=1, y=2), p2=Point(x=3, y=4))', input=FunctionExpression(thought=None, action='add_points(p1=Point(x=1, y=2), p2=Point(x=3, y=4))'), parsed_input=None, output=FunctionOutput(name='add_points', input=Function(thought=None, name='add_points', args=(), kwargs={'p1': Point(x=1, y=2), 'p2': Point(x=3, y=4)}), parsed_input=None, output=Point(x=4, y=6), error=None), error=None) + Function output: FunctionOutput(name='numpy_sum(arr=[[1, 2], [3, 4]])', input=FunctionExpression(thought=None, action='numpy_sum(arr=[[1, 2], [3, 4]])'), parsed_input=None, output=FunctionOutput(name='numpy_sum', input=Function(thought=None, name='numpy_sum', args=(), kwargs={'arr': [[1, 2], [3, 4]]}), parsed_input=None, output=10, error=None), error=None) + + 4 Query: multiply 2 with local variable x and divide 2 by 3 + -------------------------------------------------- + Function_expr: [FunctionExpression(thought=None, action='multiply(a=2, b=x)'), FunctionExpression(thought=None, action='divide(a=2.0, b=3.0)')] + Function output: FunctionOutput(name='multiply(a=2, b=x)', input=FunctionExpression(thought=None, action='multiply(a=2, b=x)'), parsed_input=None, output=FunctionOutput(name='multiply', input=Function(thought=None, name='multiply', args=(), kwargs={'a': 2, 'b': 2}), parsed_input=None, output=4, error=None), error=None) + Function output: FunctionOutput(name='divide(a=2.0, b=3.0)', input=FunctionExpression(thought=None, action='divide(a=2.0, b=3.0)'), parsed_input=None, output=FunctionOutput(name='divide', input=Function(thought=None, name='divide', args=(), kwargs={'a': 2.0, 'b': 3.0}), parsed_input=None, output=0.6666666666666666, error=None), error=None) + + 6 Query: Add 5 to variable y + -------------------------------------------------- + Function_expr: [FunctionExpression(thought=None, action='add(a=y, b=5)')] + Function output: FunctionOutput(name='add(a=y, b=5)', input=FunctionExpression(thought=None, action='add(a=y, b=5)'), parsed_input=None, output=FunctionOutput(name='add', input=Function(thought=None, name='add', args=(), kwargs={'a': 0, 'b': 5}), parsed_input=None, output=5, error=None), error=None) + +.. admonition:: References + :class: highlight + + 1. 
OpenAI tools API: https://beta.openai.com/docs/api-reference/tools + +.. admonition:: API References + :class: highlight + + - :class:`core.types.FunctionDefinition` + - :class:`core.types.Function` + - :class:`core.types.FunctionExpression` + - :class:`core.types.FunctionOutput` + - :class:`core.func_tool.FunctionTool` + - :class:`core.tool_manager.ToolManager` + - :func:`core.functional.get_fun_schema` + - :func:`core.functional.parse_function_call_expr` + - :func:`core.functional.sandbox_execute` + - :func:`core.functional.generate_function_call_expression_from_callable` diff --git a/docs/source/get_started/community.rst b/docs/source/get_started/community.rst index b5e1c23e..2f529eb6 100644 --- a/docs/source/get_started/community.rst +++ b/docs/source/get_started/community.rst @@ -1,11 +1,19 @@ Community ============ -Learn, share and collaborate with the LightRAG AI community +**Learn, share and collaborate with the LightRAG AI community** +You can join our community on various platforms: -Discord +* `Discord `_ +* `GitHub Discussion `_ -Github Discussion +.. _discord-link: -.. blogs \ No newline at end of file +Join our **Discord** to engage in real-time conversations with other members. + +.. _github-link: + +Participate in **GitHub Discussions** to share ideas, ask questions, and collaborate on projects. + +.. blogs diff --git a/docs/source/get_started/index.rst b/docs/source/get_started/index.rst index 4e0ffa41..c973024d 100644 --- a/docs/source/get_started/index.rst +++ b/docs/source/get_started/index.rst @@ -8,6 +8,6 @@ Here is the content of our documentation project. .. toctree:: :maxdepth: 2 - lightrag_in_10_mins installation community + lightrag_in_10_mins diff --git a/docs/source/get_started/installation.rst b/docs/source/get_started/installation.rst index 5f35ccef..a0ddb9c4 100644 --- a/docs/source/get_started/installation.rst +++ b/docs/source/get_started/installation.rst @@ -1,16 +1,117 @@ Installation ============ -[Xiaoyi] +LightRAG is available in Python. -To start with LightRAG, please follow the steps: +1. Install LightRAG +~~~~~~~~~~~~~~~~~~~~ -1. Clone the repository. +To install the package, run: -2. Setup API keys by make a copy of ``.env.example`` to ``.env`` and fill in the necessary API keys. +.. code-block:: bash -3. Setup the Python environment using ``poetry install``. And activate the environment using ``poetry shell``. + pip install lightrag -4. (For contributors only) Install pre-commit into your git hooks using ``pre-commit install``, which will automatically check the code standard on every commit. -5. Now you should be able to run any file in the repo. + +2. Set up API keys +~~~~~~~~~~~~~~~~~~~ + +A ``.env`` file is recommended. +You can have it at your project root directory. +Here is an example: + + + +.. code-block:: bash + + OPENAI_API_KEY=YOUR_API_KEY_IF_YOU_USE_OPENAI + GROQ_API_KEY=YOUR_API_KEY_IF_YOU_USE_GROQ + ANTHROPIC_API_KEY=YOUR_API_KEY_IF_YOU_USE_ANTHROPIC + GOOGLE_API_KEY=YOUR_API_KEY_IF_YOU_USE_GOOGLE + COHERE_API_KEY=YOUR_API_KEY_IF_YOU_USE_COHERE + HF_TOKEN=YOUR_API_KEY_IF_YOU_USE_HF + + +3. Load environment variables +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can use the following import: + +.. code-block:: python + + from lightrag.utils import setup_env() + + setup_env() + +Or, you can load it yourself with ``python-dotenv``: + +.. code-block:: python + + from dotenv import load_dotenv + load_dotenv() # This loads the environment variables from `.env`. + +You can place the above code in your project's root ``__init__.py`` file. 
+This setup ensures that LightRAG can access all necessary configurations during runtime. + +1. Install Optional Packages +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +LightRAG currently has built-in support for (1) OpenAI, Groq, Anthropic, Google, and Cohere, and (2) FAISS and Transformers. +You can find all optional packages at :class:`utils.lazy_import.OptionalPackages`. +Make sure to install the necessary SDKs for the components you plan to use. +Here is the list of our tested versions: + + +.. code-block:: + + openai = "^1.12.0" + groq = "^0.5.0" + faiss-cpu = "^1.8.0" + sqlalchemy = "^2.0.30" + cohere = "^5.5.8" + pgvector = "^0.2.5" + anthropic = "^0.26.0" + google-generativeai = "^0.5.4" + + + + + + +.. Poetry Installation +.. -------------------------- + +.. Developers and contributors who need access to the source code or wish to contribute to the project should set up their environment as follows: + +.. 1. **Clone the Repository:** + +.. Start by cloning the LightRAG repository to your local machine: + +.. .. code-block:: bash + +.. git clone https://github.com/SylphAI-Inc/LightRAG +.. cd LightRAG + +.. 2. **Configure API Keys:** + +.. Copy the example environment file and add your API keys: + +.. .. code-block:: bash + +.. cp .env.example .env +.. # Open .env and fill in your API keys + +.. 3. **Install Dependencies:** + +.. Use Poetry to install the dependencies and set up the virtual environment: + +.. .. code-block:: bash + +.. poetry install +.. poetry shell + +.. 4. **Verification:** + +.. Now, you should be able to run any file within the repository or execute tests to confirm everything is set up correctly. diff --git a/docs/source/index.rst b/docs/source/index.rst index b23ce133..a883edb0 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,17 +1,13 @@ -.. LightRAG documentation master file, created by - sphinx-quickstart on Thu May 9 15:45:29 2024. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. ======================= -LightRAG Home +Introduction ======================= +LightRAG is the `PyTorch` library for building large language model (LLM) applications. We help developers with both building and optimizing `Retriever`-`Agent`-`Generator` (RAG) pipelines. +It is light, modular, and robust. -LightRAG is the "PyTorch" library for building large langage model(LLM) applications. It is super light, modular and robust like "PyTorch", and offers essential components for `Retriever`-`Agent`-`Generator` (RAG). -You have a similar coding experience as PyTorch. Here is a side to side comparison of writing a PyTorch module and a LightRAG component: -.. grid:: 2 +.. grid:: 1 :gutter: 1 .. grid-item-card:: PyTorch @@ -43,84 +39,109 @@ You have a similar coding experience as PyTorch. Here is a side to side comparis .. code-block:: python - from core.component import Component, Generator - from components.model_client import OpenAIClient + from lightrag.core import Component, Generator + from lightrag.components.model_client import GroqAPIClient + from lightrag.utils import setup_env #noqa + class SimpleQA(Component): def __init__(self): super().__init__() + template = r""" + You are a helpful assistant. 
+ + User: {{input_str}} + You: + """ self.generator = Generator( - model_client=OpenAIClient(), - model_kwargs={'model': 'gpt-3.5-turbo'} + model_client=GroqAPIClient(), + model_kwargs={"model": "llama3-8b-8192"}, + template=template, ) def call(self, query): - return self.generator.call({'input_str': query}) + return self.generator({"input_str": query}) async def acall(self, query): - return await self.generator.acall({'input_str': query}) - - qa = SimpleQA() - print(qa) - + return await self.generator.acall({"input_str": query}) -**Why LightRAG?** -1. **Clarity and Simplicity** +.. and Customizability - We understand that developers building real-world Large Language Model (LLM) applications are the real heroes. Just like AI researchers and engineers who build models on top of PyTorch, developers require **Maximum Flexibility and Customizability**: Each developer has unique data needs to build their own models/components, experiment with In-context Learning (ICL) or model finetuning, and deploy the LLM applications to production. This means the library must provide fundamental lower-level building blocks and strive for clarity and simplicity: - - We maintain no more than two levels of subclasses. - - Each core abstract class is designed to be robust and flexible. - - We use 10X less code than other libraries to achieve 10X more robustness and flexibility. +Simplicity +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Developers who are building real-world Large Language Model (LLM) applications are the real heroes. +As a library, we provide them with the fundamental building blocks with 100% clarity and simplicity. -2. **Control and Transparency** +- Two fundamental and powerful base classes: `Component` for the pipeline and `DataClass` for data interaction with LLMs. +- We end up with less than two levels of subclasses. :doc:`developer_notes/class_hierarchy`. +- The result is a library with bare minimum abstraction, providing developers with maximum customizability. - Coming from a deep AI research background, we understand that the more control and transparency developers have over their prompts, the better. In default: +.. - We use 10X less code than other libraries to achieve 10X more robustness and flexibility. - - LightRAG simplifies what developers need to send to LLM proprietary APIs to just two messages each time: a `system message` and a `user message`. This minimizes reliance on and manipulation by API providers. - - LightRAG provides advanced tooling for developers to build `agents`, `tools/function calls`, etc., without relying on any proprietary API provider's 'advanced' features such as `OpenAI` assistant, tools, and JSON format. +.. - `Class Hierarchy Visualization `_ +.. We support them with require **Maximum Flexibility and Customizability**: -3. **Suitted for Both Researchers and Production Engineers** +.. Each developer has unique data needs to build their own models/components, experiment with In-context Learning (ICL) or model finetuning, and deploy the LLM applications to production. This means the library must provide fundamental lower-level building blocks and strive for clarity and simplicity: - On top of the easiness to use, we in particular optimize the configurability of components for researchers to build their solutions and to benchmark existing solutions. - Like how PyTorch has united both researchers and production teams, it enables smooth transition from research to production. 
- With researchers building on LightRAG, production engineers can easily take over the method and test and iterate on their production data. - Researchers will want their code to be adapted into more products too. +Similar to the `PyTorch` module, our ``Component`` provides excellent visualization of the pipeline structure. +.. code-block:: + SimpleQA( + (generator): Generator( + model_kwargs={'model': 'llama3-8b-8192'}, + (prompt): Prompt( + template: + You are a helpful assistant. + + User: {{input_str}} + You: + , prompt_variables: ['input_str'] + ) + (model_client): GroqAPIClient() + ) + ) -**LightRAG vs other LLM libraries:** +.. and Robustness +Controllability +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Our simplicity did not come from doing 'less'. +On the contrary, we have to do 'more' and go 'deeper' and 'wider' on any topic to offer developers maximum control and robustness. -**LightRAG library structures as follows:** +- LLMs are sensitive to the prompt. We allow developers full control over their prompts without relying on API features such as tools and JSON format with components like ``Prompt``, ``OutputParser``, ``FunctionTool``, and ``ToolManager``. +- Our goal is not to optimize for integration, but to provide a robust abstraction with representative examples. See this in ``ModelClient`` and ``Retriever``. +- All integrations, such as different API SDKs, are formed as optional packages but all within the same library. You can easily switch to any models from different providers that we officially support. -#TODO: One diagram to make people understand lightrag faster -* `core` - Base abstractions, core functions, and core components like `Generator` and `Embedder` to support more advanced components. -* `components` - Components that are built on top of the core directive. Users will install relevant depencides on their own for some components. +.. Coming from a deep AI research background, we understand that the more control and transparency developers have over their prompts, the better. In default: -**LightRAG documentation is divided into two parts:** +.. - LightRAG simplifies what developers need to send to LLM proprietary APIs to just two messages each time: a `system message` and a `user message`. This minimizes reliance on and manipulation by API providers. -* **Developer Documentation**: This documentation explains how LightRAG is designed in more depth and is especially useful for developers who want to contribute to LightRAG. +.. - LightRAG provides advanced tooling for developers to build `agents`, `tools/function calls`, etc., without relying on any proprietary API provider's 'advanced' features such as `OpenAI` assistant, tools, and JSON format -* **User Documentation**: This documentation is for users who want to use LightRAG to build their applications. +.. It is the future of LLM applications -We encourage all users to at least skim through the developer documentation. Different from "PyTorch" where a normal user does not have to customize a building module for neural network, -LLM applications have much bigger scope and varies even more to different product environments, so developers customizing components on their own is much more common. +Unites both Research and Production +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +On top of the easiness to use, we in particular optimize the configurability of components for researchers to build their solutions and to benchmark existing solutions. 
+Like how PyTorch has united both researchers and production teams, it enables smooth transition from research to production. +With researchers building on LightRAG, production engineers can easily take over the method and test and iterate on their production data. +Researchers will want their code to be adapted into more products too. .. toctree:: :glob: :maxdepth: 1 - :caption: New Users - + :hidden: get_started/index @@ -130,35 +151,32 @@ LLM applications have much bigger scope and varies even more to different produc .. toctree:: :glob: :maxdepth: 1 - :caption: Tutorials - How each part works + :hidden: developer_notes/index - - - - + .. :caption: Tutorials - How each part works + .. :hidden: .. toctree:: :maxdepth: 1 :caption: Use Cases - How different parts are used to build various LLM applications + :hidden: tutorials/index .. toctree:: + :glob: :maxdepth: 1 - :caption: API Reference + :hidden: apis/index -.. todo:: - .. toctree:: - :maxdepth: 1 - :caption: Benchmarks + .. :caption: Benchmarks - Manually add documents for the code in benchmarks + .. Manually add documents for the code in benchmarks .. :glob: @@ -172,5 +190,6 @@ LLM applications have much bigger scope and varies even more to different produc :glob: :maxdepth: 1 :caption: For Contributors + :hidden: contributor/index diff --git a/docs/source/insert_labels.py b/docs/source/insert_labels.py index bdf5f036..324c10af 100644 --- a/docs/source/insert_labels.py +++ b/docs/source/insert_labels.py @@ -4,17 +4,21 @@ def add_reference_labels(directory: str): try: for filename in os.listdir(directory): - if filename.endswith(".rst") and "index" not in filename: + if filename.endswith(".rst"): + if filename == "index.rst": + module_label = "-".join(directory.split("/")[-2:]) + else: + module_label = filename.replace(".rst", "").replace(".", "-") filepath = os.path.join(directory, filename) with open(filepath, "r+") as file: content = file.read() file.seek(0, 0) - module_label = filename.replace(".rst", "").replace(".", "-") + # module_label = filename.replace(".rst", "").replace(".", "-") if module_label not in content: label_line = f".. _{module_label}:\n\n" file.write(label_line + content) - except: - print(f"directory {directory} not exists") + except Exception as e: + print(f"directory {directory} not exists: {e}") if __name__ == "__main__": diff --git a/docs/source/remove_files.py b/docs/source/remove_files.py index b89abdb2..9cd01c4d 100644 --- a/docs/source/remove_files.py +++ b/docs/source/remove_files.py @@ -18,8 +18,8 @@ def remove_file(directory: str): for filename in os.listdir(directory): module_file = os.path.join(directory, "modules.rst") os.remove(module_file) - except: - print(f"No files to remove in {directory}") + except Exception: + print(f"No modules.rst to remove in {directory}") # remove components.rst, core.rst, prompt.rst, ... 
try: @@ -27,8 +27,8 @@ def remove_file(directory: str): name = directory.split("/")[-1] + ".rst" module_file = os.path.join(directory, name) os.remove(module_file) - except: - print(f"No files to remove in {directory}") + except Exception: + print(f"No {name} to remove in {directory}") # remove api files to avoid showing duplicated section @@ -41,11 +41,11 @@ def remove_file(directory: str): "components.model_client.google_client.rst", "components.model_client.transformers_client.rst", "components.retriever.llm_retriever.rst", - "components.agent.react_agent.rst", + "components.agent.react.rst", "components.model_client.anthropic_client.rst", "components.output_parsers.outputs.rst", "components.model_client.cohere_client.rst", - "components.retriever.reranker_retriever.rst" + "components.retriever.reranker_retriever.rst", ] try: for filename in os.listdir(directory): @@ -53,8 +53,8 @@ def remove_file(directory: str): filepath = os.path.join(directory, filename) os.remove(filepath) print(f"{filepath} is removed") - except: - print(f"{filepath} not existing") + except Exception: + print(f"No target files to remove in {directory}") remove_file("./source/apis/components") diff --git a/images/LightRAG-logo-circle.png b/images/LightRAG-logo-circle.png deleted file mode 100644 index 899dba0c..00000000 Binary files a/images/LightRAG-logo-circle.png and /dev/null differ diff --git a/images/LightRAG-logo-doc.jpeg b/images/LightRAG-logo-doc.jpeg deleted file mode 100644 index f8c64904..00000000 Binary files a/images/LightRAG-logo-doc.jpeg and /dev/null differ diff --git a/images/LightRAG-logo.jpg b/images/LightRAG-logo.jpg deleted file mode 100644 index 13ec4b56..00000000 Binary files a/images/LightRAG-logo.jpg and /dev/null differ diff --git a/images/LightRAG_dataflow.png b/images/LightRAG_dataflow.png deleted file mode 100644 index b385553c..00000000 Binary files a/images/LightRAG_dataflow.png and /dev/null differ diff --git a/images/lightrag_structure.png b/images/lightrag_structure.png deleted file mode 100644 index cf40969b..00000000 Binary files a/images/lightrag_structure.png and /dev/null differ diff --git a/lightrag/README.md b/lightrag/README.md index c3e91e75..4321213a 100644 --- a/lightrag/README.md +++ b/lightrag/README.md @@ -1,4 +1,4 @@ -# LightRAG + diff --git a/lightrag/bm25_index.json b/lightrag/bm25_index.json deleted file mode 100644 index 2e5bfe2b..00000000 --- a/lightrag/bm25_index.json +++ /dev/null @@ -1,55 +0,0 @@ -{ - "type": "InMemoryBM25Retriever", - "data": { - "k1": 1.5, - "b": 0.75, - "epsilon": 0.25, - "top_k": 1, - "indexed": true, - "t2d": [ - { - "hello": 1, - "world": 1 - }, - { - "beautiful": 1, - "is": 1, - "world": 1 - }, - { - "a": 1, - "day": 1, - "good": 1, - "is": 1, - "today": 1 - } - ], - "nd": { - "a": 1, - "beautiful": 1, - "day": 1, - "good": 1, - "hello": 1, - "is": 2, - "today": 1, - "world": 2 - }, - "idf": { - "a": 0.5108256237659907, - "beautiful": 0.5108256237659907, - "day": 0.5108256237659907, - "good": 0.5108256237659907, - "hello": 0.5108256237659907, - "is": 0.06385320297074884, - "today": 0.5108256237659907, - "world": 0.06385320297074884 - }, - "doc_len": [ - 2, - 3, - 5 - ], - "avgdl": 3.3333333333333335, - "corpus_size": 3 - } -} \ No newline at end of file diff --git a/lightrag/components/agent/README.md b/lightrag/components/agent/README.md deleted file mode 100644 index 1b29d07f..00000000 --- a/lightrag/components/agent/README.md +++ /dev/null @@ -1,29 +0,0 @@ -Agent is not a model or LLM model. 
- -Agent is better defined as a system that uses LLM models to plan and replan steps that each involves the usage of various tools, -such as function calls, another LLM model based on the context and history (memory) to complete a task autonomously. - -The future: the agent can write your prompts too. Check out dspy: https://github.com/stanfordnlp/dspy - -In this directory, we add the general design patterns of agent, here are four (Thanks to Andrew Ng): - -1️⃣ Reflection - -- Self-Refine: Iterative Refinement with Self-Feedback -- Reflexion: Language Agents with Verbal Reinforcement Learning - -2️⃣ Tool use - -- Gorilla: Large Language Model Connected with Massive APIs -- MM-REACT: Prompting ChatGPT for Multimodal Reasoning and Action - -3️⃣ Planning - -- Chain-of-Thought Prompting Elicits Reasoning in Large Language Models -- HuggingGPT: Solving AI Tasks with ChatGPT and its Friends in Hugging Face -- React - -4️⃣ Multi-agent collaboration - -- Communicative Agents for Software Development -- AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation diff --git a/lightrag/components/agent/__init__.py b/lightrag/components/agent/__init__.py deleted file mode 100644 index c19297b5..00000000 --- a/lightrag/components/agent/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .react_agent import * diff --git a/lightrag/components/agent/react_agent.py b/lightrag/components/agent/react_agent.py deleted file mode 100644 index fc25a1db..00000000 --- a/lightrag/components/agent/react_agent.py +++ /dev/null @@ -1,364 +0,0 @@ -""" -ReAct Agent leveraging LLM reasoning and function calling. - -Source: https://arxiv.org/abs/2210.03629, published in Mar, 2023 - -Agent is not a model or LLM model. -Agent is better defined as a system that uses LLM models to plan and replan steps that each involves the usage of various tools, -such as function calls, another LLM model based on the context and history (memory) to complete a task autonomously. - -The future: the agent can write your prompts too. Check out dspy: https://github.com/stanfordnlp/dspy - -ReAct agent can be useful for -- Multi-hop reasoning [Q&A], including dividing the query into subqueries and answering them one by one. -- Plan the usage of the given tools: highly flexible. Retriever, Generator modules or any other functions can all be wrapped as tools. - -The initial ReAct paper does not support different types of tools. We have greatly extended the flexibility of tool adaption, even including an llm tool -to answer questions that cant be answered or better be answered by llm using its world knowledge. -- Every react agent can be given a different tasks, different tools, and different LLM models to complete the task. -- 'finish' tool is defined to finish the task by joining all subqueries answers. -""" - -from typing import List, Union, Callable, Optional, Any, Dict -from dataclasses import dataclass -from copy import deepcopy -import logging - -from lightrag.core.generator import Generator -from lightrag.core.component import Component -from lightrag.core.tool_helper import FunctionTool, AsyncCallable -from lightrag.core.string_parser import JsonParser, parse_function_call -from lightrag.core.generator import GeneratorOutput -from lightrag.core.model_client import ModelClient -from lightrag.utils.logger import printc - -log = logging.getLogger(__name__) - -DEFAULT_REACT_AGENT_SYSTEM_PROMPT = r""" -{# role/task description #} -You task is to answer user's query with minimum steps and maximum accuracy using the tools provided. 
-{# REACT instructions #} -Each step you will read the previous Thought, Action, and Observation(execution result of the action)steps and then provide the next Thought and Action. - -You only have access to the following tools: -{# tools #} -{% for tool in tools %} -{{ loop.index }}. ToolName: {{ tool.metadata.name }} - Tool Description: {{ tool.metadata.description }} - Tool Parameters: {{ tool.metadata.fn_schema_str }} {#tool args can be misleading, especially if we already have type hints and docstring in the function#} -{% endfor %} -{# output is always more robust to use json than string #} ---- -Your output must be in valid JSON format(raw Python string format) with two keys: -{ - "thought": "", - "action": "ToolName(, )" -} -- Must double quote the JSON str. -- Inside of the JSON str, Must use escape double quote and escape backslash for string. -For example: -"action": "finish(\"John's.\")" ---- -{# Specifications TODO: preference between the usage of llm tool vs the other tool #} -Process: -- Step 1: Read the user query and potentially divide it into subqueries. And get started with the first subquery. -- Call one available tool at a time to solve each subquery/subquestion. \ -- At step 'finish', join all subqueries answers and finish the task. -Remember: -- Action must call one of the above tools with Took Name. It can not be empty. -- Read the Tool Description and ensure your args and kwarg follow what each tool expects in types. e.g. (a=1, b=2) if it is keyword argument or (1, 2) if it is positional. -- You will always end with 'finish' action to finish the task. The answer can be the final answer or failure message. -- When the initial query is simple, use minimum steps to answer the query. -{#Examples can be here#} -{# Check if there are any examples #} -{% if examples %} - -{% for example in examples %} -{{ example }} -{% endfor %} - -{% endif %} -<> ------------------ -{# History #} -{% for history in step_history %} -Step {{history.step}}: -{ - "thought": "{{history.thought}}", - "action": "{{history.action}}", -} -"observation": "{{history.observation}}" -{% endfor %} -{% if input_str %} -User query: -{{ input_str }} -{% endif %} -""" - - -# TODO: add better logging @xiaoyi -@dataclass -class StepOutput: - step: int - thought: str - action: str - fun_name: Optional[str] = None # parsed from action - fun_args: Optional[List[Any]] = None # parsed from action - fun_kwargs: Optional[Dict[str, Any]] = None # parsed from action - observation: Optional[str] = ( - None # when step is created, observation is not available, the funtion result - ) - - def __str__(self): - return f"Thought {self.step}: {self.thought}\nAction {self.step}: {self.action}\nObservation {self.step}: {self.observation}" - - -class ReActAgent(Generator): - r""" - ReActAgent is a type of Generator that runs multiple and sequential steps to generate the final response, with DEFAULT_REACT_AGENT_SYSTEM_PROMPT and JsonParser output_processors. - - Users need these arguments to initialize the ReActAgent: - - tools: a list of tools to use to complete the task. Each tool is a function or a function tool. - - max_steps: the maximum number of steps the agent can take to complete the task. - - All other arguments are inherited from Generator such as model_client, model_kwargs, prompt, output_processors, etc. - - There are `examples` which is optional, a list of string examples in the prompt. - - Example: - .. 
code-block:: python - - from lightrag.core.tool_helper import FunctionTool - from lightrag.components.agent.react_agent import ReActAgent - from lightrag.components.model_client import GroqAPIClient - - import time - import dotenv - # load evironment - dotenv.load_dotenv(dotenv_path=".env", override=True) - - # define the tools - def multiply(a: int, b: int) -> int: - '''Multiply two numbers.''' - return a * b - def add(a: int, b: int) -> int: - '''Add two numbers.''' - return a + b - - tools = [ - FunctionTool.from_defaults(fn=multiply), - FunctionTool.from_defaults(fn=add), - ] - - # set up examples - examples = [ - "your example, a human-like task-solving trajectory" - ] - # preset examples in the prompt - preset_prompt_kwargs = {"example": examples} - - # set up llm args - llm_model_kwargs = { - "model": "llama3-70b-8192", - "temperature": 0.0 - } - - # initialze an agent - agent = ReActAgent( - tools=tools, - model_client=GroqAPIClient(), - model_kwargs=llm_model_kwargs, - max_steps=3, - preset_prompt_kwargs=preset_prompt_kwargs - ) - - # query the agent - queries = ["What is 3 add 4?", "3*9=?"] - average_time = 0 - for query in queries: - t0 = time.time() - answer = agent(query) - - """ - - def __init__( - self, - # added arguments specifc to React - tools: List[Union[Callable, AsyncCallable, FunctionTool]] = [], - max_steps: int = 10, - *, - # the following arguments are inherited from Generator - template: str = DEFAULT_REACT_AGENT_SYSTEM_PROMPT, - preset_prompt_kwargs: Optional[ - Dict - ] = {}, # you can pass examples here, additionally leverage few-shot or many-shots ICL. - output_processors: Optional[Component] = None, - model_client: ModelClient, - model_kwargs: Optional[Dict] = {}, - ): - super().__init__( - template=template, - preset_prompt_kwargs=preset_prompt_kwargs, - output_processors=output_processors, - model_client=model_client, - model_kwargs=model_kwargs, - ) - self.tools = deepcopy(tools) - self.max_steps = max_steps - self.output_processors = output_processors or JsonParser() - - self.additional_llm_tool = Generator( - model_client=model_client, model_kwargs=model_kwargs - ) - - def llm_tool(input: str) -> str: - """ - Answer any input query with llm's world knowledge. Use it as a fallback tool or when the query is simple. - """ - # use the generator to answer the query - prompt_kwargs = {"input_str": input} # wrap the query input in the local prompt_kwargs - try: - response = self.additional_llm_tool.call(prompt_kwargs=prompt_kwargs) - json_response = response.data if isinstance(response, GeneratorOutput) else response # get json data from GeneratorOutput - # print(f"response: {response}, json_response: {json_response}") - return json_response - except Exception as e: - # print(f"Error using the generator: {e}") - log.error(f"Error using the generator: {e}") - - return None - - def finish(answer: str) -> str: - """ - Finish the task by joinging all subqueries answers. 
- """ - return answer - - self.tools.extend([llm_tool, finish]) - # convert all functions to FunctionTool, and track how to call each function, either call or acall - self.tools = [ - ( - tool - if isinstance(tool, FunctionTool) - else FunctionTool.from_defaults(fn=tool) - ) - for tool in self.tools - ] - # pass the tools to the prompt - self.prompt.update_preset_prompt_kwargs(tools=self.tools) - - self.tools_map = {tool.metadata.name: tool for tool in self.tools} - self.step_history: List[StepOutput] = [] - - def reset(self): - r"""Reset the agent to start a new query.""" - self.step_history = [] - - def _parse_text_response( - self, json_obj_response: Dict[str, Any], step: int - ) -> Optional[StepOutput]: - """ - Parse the json output - """ - try: - thought_key = "thought" - action_key = "action" - thought = json_obj_response.get(thought_key, "") - action = json_obj_response.get(action_key, "") - return StepOutput(step=step, thought=thought, action=action) - except Exception as e: - # print(f"Error parsing response: {e}") - log.error(f"Error parsing response: {e}") - return None - - def _execute_action(self, action_step: StepOutput) -> Optional[StepOutput]: - """ - Parse the action string to a function call and execute it. Update the action_step with the result. - """ - action = action_step.action - try: - fun_name, args, kwargs = parse_function_call(action, self.tools_map) - # print(f"fun_name: {fun_name}, args: {args}, kwargs: {kwargs}") - fun: Union[Callable, AsyncCallable] = self.tools_map[fun_name].fn - result = fun(*args, **kwargs) - action_step.fun_name = fun_name - action_step.fun_args = args - action_step.fun_kwargs = kwargs - - action_step.observation = result - return action_step - except Exception as e: - # print(f"Error executing {action}: {e}") - log.error(f"Error executing {action}: {e}") - # pass the error as observation so that the agent can continue and correct the error in the next step - action_step.observation = f"Error executing {action}: {e}" - return action_step - - def _run_one_step( - self, step: int, prompt_kwargs: Dict, model_kwargs: Dict - ) -> str: - """ - Run one step of the agent. 
- """ - # step_history is the only per-query variable, and should not be controlled by the user - # add the step_history to the prompt_kwargs - prompt_kwargs["step_history"] = self.step_history - - # call the super class Generator to get the response - response = super().call( - prompt_kwargs=prompt_kwargs, model_kwargs=model_kwargs - ) # response is GeneratorOutput - - # get json response data from generator output - json_response = response.data if isinstance(response, GeneratorOutput) else response - - parsed_response = self._parse_text_response( - json_obj_response=json_response, step=step - ) - # execute the action - if parsed_response and parsed_response.action: - parsed_response = self._execute_action(parsed_response) - printc(f"step: {step}, response: {parsed_response}", color="blue") - else: - # print(f"Failed to parse response for step {step}") - log.error(f"Failed to parse response for step {step}") - self.step_history.append(parsed_response) - - return response - - def call( - self, - input: str, - prompt_kwargs: Optional[Dict] = {}, - model_kwargs: Optional[Dict] = {}, - ) -> Any: - r"""prompt_kwargs: additional prompt kwargs to either replace or add to the preset prompt kwargs.""" - # wrap up the input in the prompt_kwargs - prompt_kwargs["input_str"] = input - printc(f"input_query: {input}", color="cyan") - for i in range(self.max_steps): - step = i + 1 - try: - self._run_one_step(step, prompt_kwargs, model_kwargs) - if ( - self.step_history[-1].fun_name - and self.step_history[-1].fun_name == "finish" - ): - break - except Exception as e: - error_message = f"Error running step {step}: {e}" - # print(error_message) - log.error(error_message) - try: - answer = self.step_history[-1].observation - except: - answer = None - printc(f"answer: {answer}", color="magneta") - # print(f"step_history: {self.step_history}") - log.info(f"step_history: {self.step_history}") - self.reset() - return answer - - def _extra_repr(self) -> str: - s = f"tools={self.tools}, max_steps={self.max_steps}, " - s += super()._extra_repr() - return s diff --git a/lightrag/components/data_process/document_splitter.py b/lightrag/components/data_process/document_splitter.py deleted file mode 100644 index 1a2d4598..00000000 --- a/lightrag/components/data_process/document_splitter.py +++ /dev/null @@ -1,167 +0,0 @@ -"""Text splitter to split long text into smaller chunks to fit into the token limits of embedding and LLM models.""" - -# TODO: JSON/HTML Splitter -from copy import deepcopy -from typing import List, Literal -from tqdm import tqdm - -from more_itertools import windowed - -from lightrag.core.component import Component -from lightrag.core.types import Document -from lightrag.core.tokenizer import Tokenizer - - -DocumentSplitterInputType = List[Document] -DocumentSplitterOutputType = List[Document] - - -def split_text_by_token_fn(x: str, tokenizer: Tokenizer = Tokenizer()) -> List[str]: - x = x.lower() - return tokenizer.get_string_tokens(x) - - -DocumentSplitterInputType = List[Document] -DocumentSplitterOutputType = List[Document] - - -class DocumentSplitter(Component): - __doc__ = r""" - Splits a list of text documents into a list of text documents with shorter texts. - - Output: List[Document] - - Splitting documents with long texts is a common preprocessing step for LLM applications. - The splitted documents are easier to fit into the token limits of language models, both Embedders and Generators, - and to ensure the retrieved context can be more relevant than the large text itself. 
- - Args: - split_by (str): The unit by which the document should be split. Choose from "word" for splitting by " ", - "sentence" for splitting by ".", "page" for splitting by "\\f" or "passage" for splitting by "\\n\\n". - split_length (int): The maximum number of units in each split. It can be number of works, sentences, pages or passages. - split_overlap (int): The number of units that each split should overlap. - - Example: - - .. code-block:: python - - from lightrag.core.document_splitter import DocumentSplitter - from lightrag.core.types import Document - - doc1 = Document(text="This is a test document. It is a long document.") - doc2 = Document(text="This is another test document. It is also a long document.") - splitter = DocumentSplitter(split_by="token", split_length=4, split_overlap=1) - print(splitter) - splitted_docs = splitter([doc1, doc2]) - print(splitted_docs) - """ - - def __init__( - self, - split_by: Literal["word", "token", "sentence", "page", "passage"] = "word", - split_length: int = 200, - split_overlap: int = 0, - ): - super().__init__( - split_by=split_by, split_length=split_length, split_overlap=split_overlap - ) - - self.split_by = split_by - if split_by not in ["word", "sentence", "page", "passage", "token"]: - raise ValueError( - "split_by must be one of 'word', 'sentence', 'page' or 'passage'." - ) - if split_length <= 0: - raise ValueError("split_length must be greater than 0.") - self.split_length = split_length - if split_overlap < 0: - raise ValueError("split_overlap must be greater than or equal to 0.") - self.split_overlap = split_overlap - - def split_text(self, text: str) -> List[str]: - r"""Splits a text into a list of shorter texts.""" - units = self._split_into_units(text, self.split_by) - return self._concatenate_units(units, self.split_length, self.split_overlap) - - def call(self, documents: List[Document]) -> List[Document]: - if not isinstance(documents, list) or ( - documents and not isinstance(documents[0], Document) - ): - raise TypeError("DocumentSplitter expects a List of Documents as input.") - - split_docs: List[Document] = [] - for doc in tqdm(documents, desc="Splitting documents"): - if doc.text is None: - raise ValueError( - f"DocumentSplitter only works with text documents but document.content for document ID {doc.id} is None." - ) - text_splits = self.split_text(doc.text) - meta_data = deepcopy(doc.meta_data) - split_docs += [ - Document( - text=txt, - meta_data=meta_data, - parent_doc_id=f"{doc.id}", - order=i, - vector=[], - ) - for i, txt in enumerate(text_splits) - ] - return split_docs - - def _split_into_units( - self, - text: str, - split_by: Literal["word", "sentence", "passage", "page", "token"], - ) -> List[str]: - if split_by == "token": - units = split_text_by_token_fn(x=text) - print(units) - else: # text splitter - if split_by == "page": - split_at = "\f" - elif split_by == "passage": - split_at = "\n\n" - elif split_by == "sentence": - split_at = "." - elif split_by == "word": - split_at = " " - else: - raise NotImplementedError( - "DocumentSplitter only supports 'word', 'sentence', 'page' or 'passage' split_by options." 
- ) - units = text.split(split_at) - # Add the delimiter back to all units except the last one - for i in range(len(units) - 1): - units[i] += split_at - return units - - def _concatenate_units( - self, elements: List[str], split_length: int, split_overlap: int - ) -> List[str]: - """Concatenates the elements into parts of split_length units.""" - text_splits = [] - segments = windowed(elements, n=split_length, step=split_length - split_overlap) - for seg in segments: - current_units = [unit for unit in seg if unit is not None] - txt = "".join(current_units) - if len(txt) > 0: - text_splits.append(txt) - return text_splits - - def _extra_repr(self) -> str: - s = f"split_by={self.split_by}, split_length={self.split_length}, split_overlap={self.split_overlap}" - return s - -if __name__ == "__main__": - from lightrag.core.document_splitter import DocumentSplitter - from lightrag.core.types import Document - - doc1 = Document(text="This is a simple test to check splitting.") - # doc2 = Document(text="This is another test document. It is also a long document.") - splitter = DocumentSplitter(split_by="word", split_length=5, split_overlap=2) - # print(splitter) - splitted_docs = splitter([doc1]) - # print(splitted_docs) - for doc in splitted_docs: - print(doc.text) \ No newline at end of file diff --git a/lightrag/components/reasoning/__init__.py b/lightrag/components/reasoning/__init__.py deleted file mode 100644 index 11d0f669..00000000 --- a/lightrag/components/reasoning/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .chain_of_thought import * diff --git a/lightrag/components/reasoning/chain_of_thought.py b/lightrag/components/reasoning/chain_of_thought.py deleted file mode 100644 index 02579a1b..00000000 --- a/lightrag/components/reasoning/chain_of_thought.py +++ /dev/null @@ -1,98 +0,0 @@ -""" -https://arxiv.org/abs/2201.11903, published in Jan, 2023 - -Chain of the thought(CoT) is to mimic a step-by-step thought process for arriving at the answer. You can achieve it in two ways: -1. Add instructions such as "Let's think step-by-step to answer this question". -2. Add few-shot examples such as -' -Q: Roger has 5 tennis balls. He buys 2 more cans of tennis balls. Each can has 3 tennis balls. How many tennis balls does he have now? -A: Roger started with 5 balls. 2 cansof 3 tennis balls each is 6 tennis balls. 5 + 6 = 11. The answer is 11. -' - -NOTE: CoT can be helpful for more complicated task, it also varies from task to task and model to model. -For instance, CoT might already be supported in gpt3.5+ api calls. - -Benchmark it with and without CoT to see if it helps. -""" - -from typing import Dict, Optional - -from core.component import Component -from core.generator import Generator -from core.string_parser import JsonParser -from core.model_client import ModelClient -from core.default_prompt_template import DEFAULT_LIGHTRAG_SYSTEM_PROMPT - - -COT_TASK_DESC_STR_BASIC = ( - "You are a helpful assistant. Let's think step-by-step to answer user's query." 
-) -# Using triple quotes to include JSON-like structure more cleanly -COT_TASK_DESC_STR_WITH_JSON_OUTPUT = f""" -{COT_TASK_DESC_STR_BASIC} Output JSON format: {{"thought": "", "answer": ""}} -""" - - -# ChainOfThought will just be a generator with preset_prompt_kwargs of the task_desc_str = COT_TASK_DESC_STR -# additional you can ask it to generate a json with "thought" and "anwer" keys and use jsonParser - - -class CoTGenerator(Generator): - r""" - CoTGenerator is a subclass of Generator with default task_desc_str preset for Chain of Thought. - Output will be string. - It is exactly the same as using a Generator. - Example: - ``` - cot = CoTGenerator(model_client=model_client, model_kwargs={"model": model}) - ``` - """ - - def __init__( - self, - *, - model_client: ModelClient, - model_kwargs: Dict = {}, - template: Optional[str] = None, - preset_prompt_kwargs: Optional[Dict] = None, - output_processors: Optional[Component] = None, - ) -> None: - - super().__init__( - model_client=model_client, - model_kwargs=model_kwargs, - template=template or DEFAULT_LIGHTRAG_SYSTEM_PROMPT, - preset_prompt_kwargs=preset_prompt_kwargs - or {"task_desc_str": COT_TASK_DESC_STR_BASIC}, - output_processors=output_processors, - ) - - -class CoTGeneratorWithJsonOutput(Generator): - r""" - CoTGeneratorWithJsonOutput is a subclass of Generator with default task_desc_str preset for Chain of Thought. - Output will be parsed as JSON with "thought" and "answer" keys. - Example: - ``` - cot = CoTGeneratorWithJsonOutput(model_client=model_client, model_kwargs={"model": model}) - ``` - """ - - def __init__( - self, - *, - model_client: ModelClient, - model_kwargs: Dict = {}, - template: Optional[str] = None, - preset_prompt_kwargs: Optional[Dict] = None, - output_processors: Optional[Component] = None, - ) -> None: - - super().__init__( - model_client=model_client, - model_kwargs=model_kwargs, - template=template or DEFAULT_LIGHTRAG_SYSTEM_PROMPT, - preset_prompt_kwargs=preset_prompt_kwargs - or {"task_desc_str": COT_TASK_DESC_STR_WITH_JSON_OUTPUT}, - output_processors=output_processors or JsonParser(), - ) diff --git a/lightrag/core/base_data_class.py b/lightrag/core/base_data_class.py deleted file mode 100644 index 078f7ae8..00000000 --- a/lightrag/core/base_data_class.py +++ /dev/null @@ -1,687 +0,0 @@ -""" -The role of the base data class in LightRAG for LLM applications is like `Tensor` for `PyTorch`. -""" - -from typing import List, Dict, Any, Optional, TypeVar, Type, Tuple -import enum -from dataclasses import ( - dataclass, - field, - fields, - make_dataclass, - MISSING, - is_dataclass, -) - -import json -import yaml -import warnings -import logging - - -logger = logging.getLogger(__name__) - -T_co = TypeVar("T_co", covariant=True) - - -class DataClassFormatType(enum.Enum): - r"""The format type for the DataClass schema.""" - - # for class - SCHEMA = "schema" - SIGNATURE_YAML = "signature_yaml" - SIGNATURE_JSON = "signature_json" - # for instance - EXAMPLE_YAML = "example_yaml" - EXAMPLE_JSON = "example_json" - - -def required_field(name): - r""" - A patch for `TypeError: non-default argument follows default argument` - - Use default_factory=required_field to make a field required if field before has used default - or default_factory before it. - - With this patch, our dataclass schema will make this a required field in string description. 
- """ - raise TypeError(f"The '{name}' field is required and was not provided.") - - -def _get_data_class_schema( - data_class: Type, exclude: Optional[List[str]] = None -) -> Dict[str, Dict[str, Any]]: - r"""Helper function to get the schema of a DataClass in type of Dict.""" - - if not is_dataclass(data_class): - raise ValueError("Provided class is not a dataclass") - schema: Dict[str, Dict] = {} - if exclude is None: - exclude = [] - for f in fields(data_class): - field_name = f.name - if field_name in exclude: - continue - - field_info = { - "type": f.type.__name__, - } - # add description if available - if "desc" in f.metadata or "description" in f.metadata: - field_info["desc"] = f.metadata.get("desc", f.metadata.get("description")) - - # Determine if the field is required or optional - # Using __name__ to check for function identity - if f.default is MISSING and ( - f.default_factory is MISSING - or ( - hasattr(f.default_factory, "__name__") - and f.default_factory.__name__ == "required_field" - ) - ): - field_info["required"] = True - else: - field_info["required"] = False - # if f.default is not MISSING: - # field_info["default"] = f.default - # elif f.default_factory is not MISSING: - # field_info["default"] = f.default_factory() - - schema[field_name] = field_info - - return schema - - -def convert_schema_to_signature(schema: Dict[str, Dict[str, Any]]) -> Dict[str, str]: - r"""Convert the value from _get_data_class_schema to a string description.""" - - signature = {} - for field_name, field_info in schema.items(): - field_signature = field_info.get("desc", "") - # add type to the signature - if field_info["type"]: - field_signature += f" ({field_info['type']})" - - if field_info["required"]: - field_signature += " (required)" - else: - field_signature += " (optional)" - signature[field_name] = field_signature - return signature - - -class _DataClassMeta(type): - r"""Internal metaclass for DataClass to ensure both DataClass and its inherited classes are dataclasses. - - Args: - cls: The class object being created. - It will be for base class and - for inherited class for instance. - name: the name of the class - bases: A tuple of the base classes from which the class inherits. - dct: The dictionary of attributes and methods of the class. - """ - - def __init__( - cls: Type[Any], name: str, bases: Tuple[type, ...], dct: Dict[str, Any] - ) -> None: - super(_DataClassMeta, cls).__init__(name, bases, dct) - # __name__ is lightrag.core.base_data_class, will always be the base class - # print("DataClassMeta init, class:", cls) - # print( - # f"cls.__module__ = {cls.__module__}, __name__ = {__name__}, {cls.__module__ != __name__}" - # ) - # print(f"{cls.__module__} is_dataclass(cls) = {is_dataclass(cls)} ") - if ( - not is_dataclass(cls) - and cls.__module__ != __name__ # and bases != (object,) - ): # Avoid decorating DataClass itself. - # print(f"dataclas : {cls}") - dataclass(cls) - - -# TODO: we want the child class to work either with or without dataclass decorator, -# using metaclass with DataClassMeta works if both base and child does not have dataclass decorator -# but if the child has dataclass decorator, it will not work. -# class DataClass(metaclass=_DataClassMeta): -# class OutputDataClass(DataClass): -# before we do more tests, we keep the base and child class manually decorated with dataclass - - -# 1. Support dataclass as field type, the nested dataclass using to_yaml, to_dict, or __dict__. 
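To make the signature format concrete, here is a minimal standalone sketch of the `convert_schema_to_signature` conversion above, using a made-up two-field schema:

```python
# Hypothetical schema in the shape produced by _get_data_class_schema above.
schema = {
    "age": {"type": "int", "desc": "The age of the person", "required": True},
    "name": {"type": "str", "desc": "The name of the person", "required": False},
}

# Same conversion as convert_schema_to_signature: "<desc> (<type>) (required|optional)".
signature = {}
for field_name, field_info in schema.items():
    field_signature = field_info.get("desc", "")
    if field_info["type"]:
        field_signature += f" ({field_info['type']})"
    field_signature += " (required)" if field_info["required"] else " (optional)"
    signature[field_name] = field_signature

print(signature)
# {'age': 'The age of the person (int) (required)',
#  'name': 'The name of the person (str) (optional)'}
```

This "desc (type) (required|optional)" string is what the signature methods further down (`to_json_signature`, `to_yaml_signature`) expose to the LLM prompt.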
-@dataclass -class DataClass: - __doc__ = r"""The base data class for all data types that interact with LLMs. - - Designed to streamline the handling, serialization, and description of data within our applications, especially to LLM prompt. - We explicitly handle this instead of relying on 3rd party libraries such as pydantic or marshmallow to have better - transparency and to keep the order of the fields when get serialized. - - How to create your own dataclass? - - 1. Subclass DataClass and define the fields with the `field` decorator. - 2. Use the `medata` argument and a `desc` key to describe the field. - 3. Keep the order of the fields as how you want them to be serialized and described to LLMs. - 4. field with default value is considered optional. Field without default value and field with default_factory=required_field is considered required. - - How to use it? - - Describing: - - We defined :ref:`DataClassFormatType ` to categorize DataClass description formats - as input or output in LLM prompt. - - - (1) For describing the class (data structure): - - `Signature` is more token effcient than schema, and schema as it is always a json string, when you want LLMs to output yaml, it can be misleading if you describe the data structure in json. - - - DataClassFormatType.SCHEMA: a more standard way to describe the data structure in Json string, :meth:`to_data_class_schema_str` as string and :meth:`to_data_class_schema` as dict. - - DataClassFormatType.SIGNATURE_JSON: emitating a json object with field name as key and description as value, :meth:`to_json_signature` as string. - - DataClassFormatType.SIGNATURE_YAML: emitating a yaml object with field name as key and description as value, :meth:`to_yaml_signature` as string. - - (2) For describing the class instance: this is helpful to do few-shot examples in LLM prompt. - - DataClassFormatType.EXAMPLE_JSON: the json representation of the instance, :meth:`to_json` as string. - - DataClassFormatType.EXAMPLE_YAML: the yaml representation of the instance, :meth:`to_yaml` as string. - - Overall, we have a unified class method :meth:`format_str` to generate formatted output based on the type of operation and class/instance context. - - note:: - You do not need to use our format, overwrite any method in the subclass to fit in your needs. - - Loading data: - - - :meth:`from_dict` is used to create a dataclass instance from a dictionary. - - - Refer :ref:`DataClass` for more detailed instructions. - - Examples: - - .. 
code-block:: python - - # Define a dataclass - from lightrag.core import DataClass - - class MyOutputs(DataClass): - age: int = field(metadata={"desc": "The age of the person", "prefix": "Age:"}) - name: str = field(metadata={"desc": "The name of the person", "prefix": "Name:"}) - - # Create json signature - print(MyOutputs.to_json_signature()) - # Output: - # { - # "age": "The age of the person", - # "name": "The name of the person" - # } - # Create yaml signature - print(MyOutputs.to_yaml_signature()) - # Output: - # age: The age of the person - # name: The name of the person - - # Create a dataclass instance - my_instance = MyOutputs(age=25, name="John Doe") - # Create json example - print(my_instance.to_json_example()) - # Output: - # { - # "age": 25, - # "name": "John Doe" - # } - # Create yaml signature - print(my_instance.to_yaml_example()) - # Output: - # age: 25 - # name: John Doe - - """ - - def __post_init__(self): - # TODO: use desription in the field - for f in fields(self): - if "desc" not in f.metadata: - warnings.warn( - f"Field {f.name} is missing 'desc' in metadata", UserWarning - ) - - def set_field_value(self, field_name: str, value: Any): - r"""Set the value of a field in the dataclass instance.""" - if field_name not in self.__dict__: # check if the field exists - logging.warning(f"Field {field_name} does not exist in the dataclass") - setattr(self, field_name, value) - - def to_dict(self, exclude: Optional[List[str]] = None) -> dict: - """More of an internal method used for serialization. - - Converts the dataclass to a dictionary, optionally excluding specified fields. - Use this to save states of the instance, not advised to use in LLM prompt. - """ - if not is_dataclass(self): - raise ValueError("to_dict() called on a class type, not an instance.") - if exclude is None: - exclude = [] - exclude_set = set(exclude) - - data = { - field.name: getattr(self, field.name) - for field in fields(self) - if field.name not in exclude_set - } - # Recursively convert nested dataclasses - for key, value in data.items(): - if is_dataclass(value): - if hasattr(value, "to_dict"): - data[key] = value.to_dict() - elif hasattr(value, "__dict__"): - data[key] = value.__dict__ - else: - logging.warning( - f"Field {key} is not a dataclass or does not have a to_dict method" - ) - return data - - @classmethod - def from_dict(cls, data: Dict[str, Any]): - r""" - Create a dataclass instance from a dictionary. - """ - # Recursively construct nested dataclasses - field_types = {f.name: f.type for f in fields(cls)} - init_kwargs = {} - for key, value in data.items(): - if key not in field_types: - logging.warning(f"Field {key} does not exist in the dataclass") - continue - field_type = field_types[key] - if is_dataclass(field_type): - if isinstance(value, str): - # Attempt to parse the string as JSON - try: - value = json.loads(value) - except json.JSONDecodeError: - logging.error(f"Error decoding JSON for field {key}") - continue - if hasattr(field_type, "from_dict"): - init_kwargs[key] = field_type.from_dict(value) - elif hasattr(field_type, "__dict__"): - init_kwargs[key] = field_type(**value) - else: - logging.warning( - f"Field {key} is not a dataclass or does not have a from_dict method" - ) - else: - init_kwargs[key] = value - return cls(**init_kwargs) - - @classmethod - def format_class_str( - cls: "DataClass", - format_type: DataClassFormatType, - exclude: Optional[List[str]] = None, - ) -> str: - """Generate formatted output based on the type of operation and class/instance context. 
- - Args: - format_type (DataClassFormatType): Specifies the format and type (schema, signature, example). - - Returns: - str: A string representing the formatted output. - - Examples: - - .. code-block:: python - - # Define a dataclass - from lightrag.core import DataClass - - """ - assert format_type in [ - DataClassFormatType.SIGNATURE_JSON, - DataClassFormatType.SIGNATURE_YAML, - DataClassFormatType.SCHEMA, - ], "format_class_str is only for class formats" - if not is_dataclass(cls): - raise ValueError(f"{cls.__name__} must be a dataclass to use format_str.") - - # Check the type of format required and whether it's called on an instance or class - if format_type == DataClassFormatType.SIGNATURE_JSON: - return cls.to_json_signature(exclude) - elif format_type == DataClassFormatType.SIGNATURE_YAML: - return cls.to_yaml_signature(exclude) - - elif format_type == DataClassFormatType.SCHEMA: - return cls.to_data_class_schema_str(exclude) - else: - raise ValueError(f"Unsupported format type: {format_type}") - - def format_example_str( - self, format_type: DataClassFormatType, exclude: Optional[List[str]] = None - ) -> str: - """Generate formatted output based on the type of operation and class/instance context. - - Args: - format_type (DataClassFormatType): Specifies the format and type (schema, signature, example). - - Returns: - str: A string representing the formatted output. - - """ - if not is_dataclass(self): - raise ValueError(f"{self.__name__} must be a dataclass to use format_str.") - - assert format_type in [ - DataClassFormatType.EXAMPLE_JSON, - DataClassFormatType.EXAMPLE_YAML, - ], "format_str is only for example formats" - - # Check the type of format required and whether it's called on an instance or class - if format_type == DataClassFormatType.EXAMPLE_JSON: - return self.to_json(exclude) - elif format_type == DataClassFormatType.EXAMPLE_YAML: - return self.to_yaml(exclude) - else: - raise ValueError(f"Unsupported format type: {format_type}") - - @classmethod - def to_data_class_schema( - cls, exclude: Optional[List[str]] = None - ) -> Dict[str, Dict[str, Any]]: - """Generate a Json schema which is more detailed than the signature.""" - return _get_data_class_schema(cls, exclude) - - @classmethod - def to_data_class_schema_str(cls, exclude: Optional[List[str]] = None) -> str: - """Generate a Json schema which is more detailed than the signature.""" - schema = cls.to_data_class_schema(exclude) - return json.dumps(schema, indent=4) - - @classmethod - def to_yaml_signature(cls, exclude: Optional[List[str]] = None) -> str: - r"""Generate a YAML signature for the class from desc in metadata. - - Used mostly as LLM prompt to describe the output data format. - """ - # NOTE: we manually format the yaml string as the yaml.dump will always sort the keys - # Which can impact the final model output - schema = cls.to_data_class_schema(exclude) - signature_dict = convert_schema_to_signature(schema) - yaml_content = [] - for key, value in signature_dict.items(): - yaml_content.append(f"{key}: {value}") - - yaml_output = "\n".join(yaml_content) - return yaml_output - # return yaml.dump(signature_dict, default_flow_style=False) - - @classmethod - def to_json_signature(cls, exclude: Optional[List[str]] = None) -> str: - """Generate a JSON `signature`(json string) for the class from desc in metadata. - - Used mostly as LLM prompt to describe the output data format. 
- - Example: - - >>> @dataclass - >>> class MyOutputs(DataClass): - >>> age: int = field(metadata={"desc": "The age of the person", "prefix": "Age:"}) - >>> name: str = field(metadata={"desc": "The name of the person", "prefix": "Name:"}) - - >>> print(MyOutputs.to_json_signature()) - >>> # Output is a JSON string: - >>> # '{ - >>> # "age": "The age of the person (int) (required)", - >>> # "name": "The name of the person (str) (required)" - >>> #}' - """ - schema = cls.to_data_class_schema(exclude) - signature_dict = convert_schema_to_signature(schema) - # # manually format the json string as the json.dump will always sort the keys - # # Which can impact the final model output - # json_content = [] - # for key, value in signature_dict.items(): - # json_content.append(f'"{key}": "{value}"') - - # # Join all parts with commas to form the complete JSON string - # json_output = ",\n".join(json_content) - # # return "{\n" + json_output + "\n}" - return json.dumps(signature_dict, indent=4) - - def to_yaml(self, exclude: Optional[List[str]] = None) -> str: - """ - Convert the dataclass instance to a YAML string. - - Manually formats each field to ensure proper YAML output without unwanted characters. - - You can load it back to yaml object with: - >>> yaml.safe_load(yaml_string) - """ - exclude = exclude or [] - yaml_content = [] - indent_str = " " * 2 - - for f in fields(self): - if f.name and exclude and f.name in exclude: - continue - value = getattr(self, f.name) - # Serialize value to a more controlled YAML format string - if isinstance(value, str): - # Directly format strings to ensure quotes are correctly placed - value_formatted = f'"{value}"' - yaml_content.append(f"{f.name}: {value_formatted}") - elif isinstance(value, (list, dict)): - value_formatted = yaml.dump(value, default_flow_style=False) - yaml_content.append(f"{f.name}: \n{value_formatted}") # same line - # other class, check if they have to_dict method, other wise, use __dict__ - elif ( - hasattr(value, "to_yaml") - or hasattr(value, "to_dict") - or hasattr(value, "__dict__") - ): - if hasattr(value, "to_yaml"): - value_formatted = value.to_yaml() - else: - if hasattr(value, "to_dict"): - value_formatted = yaml.dump( - value.to_dict(), default_flow_style=False - ) - else: - value_formatted = yaml.dump( - value.__dict__, default_flow_style=False - ) - # add indent to each line - value_formatted = indent_str + f"\n{indent_str}".join( - value_formatted.split("\n") - ) - value_formatted = value_formatted.rstrip().rstrip("\n...") - content = f"{f.name}: \n{value_formatted}" - yaml_content.append(content) - else: - # Use yaml.dump for other types but ensure the output is clean - value_formatted = ( - yaml.dump(value, default_flow_style=False).strip().rstrip("\n...") - ) - - yaml_content.append(f"{f.name}: {value_formatted}") - yaml_output = "\n".join(yaml_content) - return yaml_output - - def to_json(self, exclude: Optional[List[str]] = None) -> str: - """ - Convert the dataclass instance to a JSON string. - - Manually formats each field to ensure proper JSON output without unwanted characters. 
- - You can load it back to json object with: - >>> json.loads(json_string) - """ - exclude = exclude or [] - json_content = {} - for f in fields(self): - if f.name and exclude and f.name in exclude: - continue - value = getattr(self, f.name) - # Serialize each field according to its type - # For strings, integers, floats, booleans, directly assign - # For lists and dicts, use json.dumps to ensure proper formatting - if isinstance(value, (str, int, float, bool)): - json_content[f.name] = value - elif isinstance(value, (list, dict)): - # Convert lists and dictionaries to a string and then parse it back to ensure correct format - json_content[f.name] = json.loads(json.dumps(value)) - # other class, check if they have to_dict method, other wise, use __dict__ - elif ( - hasattr(value, "to_json") - or hasattr(value, "to_dict") - or hasattr(value, "__dict__") - ): - if hasattr(value, "to_json"): - json_content[f.name] = json.loads(value.to_json()) - else: - if hasattr(value, "to_dict"): - json_content[f.name] = value.to_dict() - else: - json_content[f.name] = value.__dict__ - else: - # Fallback for other types if necessary, can be customized further based on needs - json_content[f.name] = str(value) - - # Convert the entire content dictionary to a JSON string - json_output = json.dumps(json_content, indent=4) - return json_output - - @classmethod - def to_dict_class(cls, exclude: Optional[List[str]] = None) -> dict: - """More of an internal used class method for serialization. - - Converts the dataclass to a dictionary, optionally excluding specified fields. - Use this to save states of the class in serialization, not advised to use in LLM prompt. - """ - return cls.to_data_class_schema(exclude) - - -"""Reserved for Agent to automatically create a dataclass and to manipulate the code""" - - -@dataclass -class DynamicDataClassFactory: - __doc__ = r""" - This class is used to create a dynamic dataclass called `DynamicOutputs` from a dictionary. - The dictionary should have the following structure: - { - "field_name": { - "value": field_value, - "desc": "Field description", - "prefix": "Field prefix", - }, - - } - - Examples: - - .. 
code-block:: python - - data = { - "age": {"value": 30, "desc": "The age of the person", "prefix": "Age:"}, - "name": {"value": "John Doe", "desc": "The name of the person", "prefix": "Name:"}, - } - - DynamicOutputs = DynamicDataClassFactory.create_from_dict(data) - class_instance = DynamicOutputs() - print(class_instance) - - # Output: - # DataClass(age=30, name='John Doe') - """ - - @staticmethod - def create_from_dict(data: dict, base_class=DataClass): - fields_spec = [] - for key, value_dict in data.items(): - field_type = type(value_dict["value"]) - default_value = value_dict["value"] - metadata = { - "desc": value_dict.get("desc", "No description provided"), - "prefix": value_dict.get("prefix", ""), - } - fields_spec.append( - (key, field_type, field(default=default_value, metadata=metadata)) - ) - - dynamic_class = make_dataclass( - "DynamicOutputs", fields_spec, bases=(base_class,) - ) - - return dynamic_class - - -if __name__ == "__main__": - from dataclasses import dataclass - - @dataclass - class Address: - street: str - city: str - postal_code: str - - @dataclass - class Person(DataClass): - name: str - age: int - address: Address - - person = Person( - name="John Doe", - age=30, - address=Address(street="123 Main St", city="Anytown", postal_code="12345"), - ) - print(person.to_yaml()) - yaml_str = person.to_yaml() - print(yaml_str) - print("last char", repr(yaml_str[-2:])) - print(yaml.safe_load(yaml_str)) - restored_person = Person.from_dict(yaml.safe_load(yaml_str)) - print(restored_person) - - # test to_json - print(person.to_json()) - json_str = person.to_json() - print(json_str) - print(json.loads(json_str)) - restored_person = Person.from_dict(json.loads(json_str)) - print(restored_person) - - # now try a list of nested dataclass - @dataclass - class Company(DataClass): - name: str - address: Address - employees: List[int] # employee ids - - company = Company( - name="ACME", - address=Address(street="123 Main St", city="Anytown", postal_code="12345"), - employees=[1, 2, 3], - ) - print(company.to_yaml()) - yaml_str = company.to_yaml() - default_yaml_str = yaml.dump(company, default_flow_style=False) - # load back - restored_company = Company.from_dict(yaml.safe_load(yaml_str)) - print(restored_company) - # print("default yaml:", default_yaml_str) - # print(yaml.safe_load(default_yaml_str)), will fail as it is not a valid yaml string - - json_str = company.to_json() - print(json_str) - restored_company = Company.from_dict(json.loads(json_str)) - print(restored_company) - print("to_dict:", company.to_dict()) - print("to_dict_class:", Company.to_dict_class()) - - print(f"person to dict {person.to_dict()}") - print(f"person to dict default {person.__dict__}") - - # default_json_str = json.dumps(company, indent=4) - # print(default_json_str) - # print(json.loads(default_json_str)) diff --git a/lightrag/core/functional.py b/lightrag/core/functional.py deleted file mode 100644 index ada536e4..00000000 --- a/lightrag/core/functional.py +++ /dev/null @@ -1,256 +0,0 @@ -"""Functional interface. -Core functions we use to build across the components. -Users can leverage these functions to customize their own components.""" - -from typing import Dict, Any, Callable, Union, List, Tuple -import numpy as np -import re -import json - - -def compose_model_kwargs(default_model_kwargs: Dict, model_kwargs: Dict) -> Dict: - r""" - The model configuration exclude the input itself. - Combine the default model, model_kwargs with the passed model_kwargs. 
- Example: - model_kwargs = {"temperature": 0.5, "model": "gpt-3.5-turbo"} - self.model_kwargs = {"model": "gpt-3.5"} - combine_kwargs(model_kwargs) => {"temperature": 0.5, "model": "gpt-3.5-turbo"} - - """ - pass_model_kwargs = default_model_kwargs.copy() - - if model_kwargs: - pass_model_kwargs.update(model_kwargs) - return pass_model_kwargs - - -VECTOR_TYPE = Union[List[float], np.ndarray] - - -def is_normalized(v: VECTOR_TYPE, tol=1e-4) -> bool: - if isinstance(v, list): - v = np.array(v) - # Compute the norm of the vector (assuming v is 1D) - norm = np.linalg.norm(v) - # Check if the norm is approximately 1 - return np.abs(norm - 1) < tol - - -def normalize_np_array(v: np.ndarray) -> np.ndarray: - # Compute the norm of the vector (assuming v is 1D) - norm = np.linalg.norm(v) - # Normalize the vector - normalized_v = v / norm - # Return the normalized vector - return normalized_v - - -def normalize_vector(v: VECTOR_TYPE) -> List[float]: - if isinstance(v, list): - v = np.array(v) - # Compute the norm of the vector (assuming v is 1D) - norm = np.linalg.norm(v) - # Normalize the vector - normalized_v = v / norm - # Return the normalized vector as a list - return normalized_v.tolist() - - -def get_top_k_indices_scores( - scores: Union[List[float], np.ndarray], top_k: int -) -> Tuple[List[int], List[float]]: - if isinstance(scores, list): - scores_np = np.array(scores) - else: - scores_np = scores - top_k_indices = np.argsort(scores_np)[-top_k:][::-1] - top_k_scores = scores_np[top_k_indices] - return top_k_indices.tolist(), top_k_scores.tolist() - - -def generate_readable_key_for_function(fn: Callable) -> str: - - module_name = fn.__module__ - function_name = fn.__name__ - return f"{module_name}.{function_name}" - - -def extract_json_str(text: str, add_missing_right_brace: bool = True) -> str: - """ - Extract JSON string from text. - NOTE: Only handles the first JSON object found in the text. And it expects at least one JSON object in the text. - If right brace is not found, we add one to the end of the string. - """ - # NOTE: this regex parsing is taken from langchain.output_parsers.pydantic - text = text.strip().replace("{{", "{").replace("}}", "}") - start = text.find("{") - if start == -1: - raise ValueError(f"No JSON object found in the text: {text}") - - # Attempt to find the matching closing brace - brace_count = 0 - end = -1 - for i in range(start, len(text)): - if text[i] == "{": - brace_count += 1 - elif text[i] == "}": - brace_count -= 1 - - if brace_count == 0: - end = i - break - - if end == -1 and add_missing_right_brace: - # If no closing brace is found, but we are allowed to add one - text += "}" - end = len(text) - 1 - elif end == -1: - raise ValueError( - "Incomplete JSON object found and add_missing_right_brace is False." - ) - - return text[start : end + 1] - - -def extract_list_str(text: str, add_missing_right_bracket: bool = True) -> str: - """ - Extract the first complete list string from the provided text. If the list string is incomplete - (missing the closing bracket), an option allows adding a closing bracket at the end. - - Args: - text (str): The text containing potential list data. - add_missing_right_bracket (bool): Whether to add a closing bracket if it is missing. - - Returns: - str: The extracted list string. - - Raises: - ValueError: If no list is found or if the list extraction is incomplete - without the option to add a missing bracket. 
- """ - text = text.strip() - start = text.find("[") - if start == -1: - raise ValueError("No list found in the text.") - - # Attempt to find the matching closing bracket - bracket_count = 0 - end = -1 - for i in range(start, len(text)): - if text[i] == "[": - bracket_count += 1 - elif text[i] == "]": - bracket_count -= 1 - - if bracket_count == 0: - end = i - break - - if end == -1 and add_missing_right_bracket: - # If no closing bracket is found, but we are allowed to add one - text += "]" - end = len(text) - 1 - elif end == -1: - raise ValueError( - "Incomplete list found and add_missing_right_bracket is False." - ) - - return text[start : end + 1] - - -def extract_yaml_str(text: str) -> str: - r"""Extract YAML string from text. - - In default, we use regex pattern to match yaml code blocks within triple backticks with optional yaml or yml prefix. - """ - try: - yaml_re_pattern: re.Pattern = re.compile( - r"^```(?:ya?ml)?(?P[^`]*)", re.MULTILINE | re.DOTALL - ) - match = yaml_re_pattern.search(text.strip()) - - yaml_str = "" - if match: - yaml_str = match.group("yaml") - else: - yaml_str = text - return yaml_str - except Exception as e: - raise ValueError(f"Failed to extract YAML from text: {e}") - - -def fix_json_missing_commas(json_str: str) -> str: - # Example: adding missing commas, only after double quotes - # Regular expression to find missing commas - regex = r'(?<=[}\]"\'\d])(\s+)(?=[\{"\[])' - - # Add commas where missing - fixed_json_str = re.sub(regex, r",\1", json_str) - - return fixed_json_str - - -def fix_json_escaped_single_quotes(json_str: str) -> str: - # First, replace improperly escaped single quotes inside strings - # json_str = re.sub(r"(? Dict[str, Any]: - r""" - Parse a YAML string to a Python object. - yaml_str: has to be a valid YAML string. - """ - try: - import yaml - - yaml_obj = yaml.safe_load(yaml_str) - return yaml_obj - except yaml.YAMLError as e: - raise ValueError( - f"Got invalid YAML object. Error: {e}. Got YAML string: {yaml_str}" - ) - except NameError as exc: - raise ImportError("Please pip install PyYAML.") from exc - - -def parse_json_str_to_obj(json_str: str) -> Dict[str, Any]: - r""" - Parse a JSON string to a Python object. - json_str: has to be a valid JSON string. Either {} or []. - """ - json_str = json_str.strip() - try: - json_obj = json.loads(json_str) - return json_obj - except json.JSONDecodeError: - # 2nd attemp after fixing the json string - try: - print("Trying to fix potential missing commas...") - json_str = fix_json_missing_commas(json_str) - print("Trying to fix scaped single quotes...") - json_str = fix_json_escaped_single_quotes(json_str) - print(f"Fixed JSON string: {json_str}") - json_obj = json.loads(json_str) - return json_obj - except json.JSONDecodeError: - # 3rd attemp using yaml - try: - import yaml - - # NOTE: parsing again with pyyaml - # pyyaml is less strict, and allows for trailing commas - # right now we rely on this since guidance program generates - # trailing commas - print("Parsing JSON string with PyYAML...") - json_obj = yaml.safe_load(json_str) - return json_obj - except yaml.YAMLError as e: - raise ValueError( - f"Got invalid JSON object. Error: {e}. 
Got JSON string: {json_str}" - ) - except NameError as exc: - raise ImportError("Please pip install PyYAML.") from exc diff --git a/lightrag/core/memory.py b/lightrag/core/memory.py deleted file mode 100644 index 45b424cf..00000000 --- a/lightrag/core/memory.py +++ /dev/null @@ -1,35 +0,0 @@ -""" -Memory is more of a db or data type where you can save all users' data and retrieve it when needed. - -We can control if the memory is just per-session or retrieve from the users' all history. - -The main form of the memory is a list of DialogSessions, where each DialogSession is a list of DialogTurns. -When memory becomes too large, we need to (1) compress (2) RAG to retrieve the most relevant memory. - -In this case, we only manage the memory for the current session. -""" - -from lightrag.core.component import Component -from lightrag.core.types import ( - DialogSession, - DialogTurn, - UserQuery, - AssistantResponse, -) - - -class Memory(Component): - def __init__(self): - super().__init__() - self.memory = DialogSession() - - def __call__(self) -> str: - return self.memory.get_chat_history_str() - - def add_dialog_turn(self, user_query: str, assistant_response: str): - self.memory.append_dialog_turn( - DialogTurn( - user_query=UserQuery(user_query), - assistant_response=AssistantResponse(assistant_response), - ) - ) diff --git a/lightrag/core/string_parser.py b/lightrag/core/string_parser.py deleted file mode 100644 index 3cb56ca0..00000000 --- a/lightrag/core/string_parser.py +++ /dev/null @@ -1,214 +0,0 @@ -""" -LLM applications requires lots of string processing. Such as the text output needed to be parsed into: -(1) JSON format or other formats -(2) SQL/Python valid format -(3) Tool(function) call format - -We design this these string_parser modules to be generic to any input text without differentiating them as input text or output text. -""" - -from typing import Any, Dict, List, Tuple -import ast - -from lightrag.core.tool_helper import ToolOutput -from lightrag.core.component import Component -import lightrag.core.functional as F - - -class ListParser(Component): - __doc__ = r"""To extract list strings from text and parse them into a list object. - - Examples: - - .. code-block:: python - - list_parser = ListParser() - test_input_4 = 'Some random text before ["item1", "item2"] and more after' - print(list_parser(test_input_4)) # Expected to extract ["item1", "item2"] - """ - - def __init__(self, add_missing_right_bracket: bool = True): - super().__init__() - self.add_missing_right_bracket = add_missing_right_bracket - - def __call__(self, input: str) -> List[Any]: - input = input.strip() - try: - list_str = F.extract_list_str(input, self.add_missing_right_bracket) - list_obj = F.parse_json_str_to_obj(list_str) - return list_obj - except Exception as e: - raise ValueError(f"Error: {e}") - - -JASON_PARSER_OUTPUT_TYPE = Dict[str, Any] - - -class JsonParser(Component): - __doc__ = r"""To extract JSON strings from text and parse them into a JSON object. - - Examples: - - .. 
code-block:: python - - json_parser = JsonParser() - json_str = "```json\n{\"key\": \"value\"}\n```" - json_obj = json_parser(json_str) - print(json_obj) # Expected to extract {"key": "value"} - """ - - def __init__(self, add_missing_right_brace: bool = True): - super().__init__() - self.add_missing_right_brace = add_missing_right_brace - - def call(self, input: str) -> JASON_PARSER_OUTPUT_TYPE: - input = input.strip() - try: - json_str = F.extract_json_str(input, self.add_missing_right_brace) - json_obj = F.parse_json_str_to_obj(json_str) - return json_obj - except Exception as e: - raise ValueError(f"Error: {e}") - - -YAML_PARSER_OUTPUT_TYPE = Dict[str, Any] - - -class YAMLParser(Component): - __doc__ = r"""To extract YAML strings from text and parse them into a YAML object. - - Examples: - - .. code-block:: python - - yaml_parser = YAMLParser() - yaml_str = "```yaml\nkey: value\n```" - yaml_obj = yaml_parser(yaml_str) - print(yaml_obj) # Expected to extract {"key": "value"} - """ - - def __init__(self): - super().__init__() - - def call(self, input: str) -> YAML_PARSER_OUTPUT_TYPE: - input = input.strip() - try: - yaml_str = F.extract_yaml_str(input) - yaml_obj = F.parse_yaml_str_to_obj(yaml_str) - return yaml_obj - except Exception as e: - raise ValueError(f"Error: {e}") - - -############################################################################################################ -# String as function call -############################################################################################################ -def evaluate_ast_node(node: ast.AST, context_map: Dict[str, Any] = None): - """ - Recursively evaluates an AST node and returns the corresponding Python object. - - Args: - node (ast.AST): The AST node to evaluate. This node can represent various parts of Python expressions, - such as literals, identifiers, lists, dictionaries, and function calls. - context_map (Dict[str, Any]): A dictionary that maps variable names to their respective values and functions. - This context is used to resolve names and execute functions. - - Returns: - Any: The result of evaluating the node. The type of the returned object depends on the nature of the node: - - Constants return their literal value. - - Names are looked up in the context_map. - - Lists and tuples return their contained values as a list or tuple. - - Dictionaries return a dictionary with keys and values evaluated. - - Function calls invoke the function with evaluated arguments and return its result. - - Raises: - ValueError: If the node type is unsupported, a ValueError is raised indicating the inability to evaluate the node. 
- """ - if isinstance(node, ast.Constant): - return node.value - elif isinstance(node, ast.Dict): - return { - evaluate_ast_node(k): evaluate_ast_node(v) - for k, v in zip(node.keys, node.values) - } - elif isinstance(node, ast.List): - return [evaluate_ast_node(elem) for elem in node.elts] - elif isinstance(node, ast.Tuple): - return tuple(evaluate_ast_node(elem) for elem in node.elts) - elif isinstance(node, ast.UnaryOp) and isinstance(node.op, ast.USub): - return -evaluate_ast_node(node.operand, context_map) # unary minus - elif isinstance( - node, ast.BinOp - ): # support "multiply(2024-2017, 12)", the "2024-2017" is a "BinOp" node - left = evaluate_ast_node(node.left, context_map) - right = evaluate_ast_node(node.right, context_map) - if isinstance(node.op, ast.Add): - return left + right - elif isinstance(node.op, ast.Sub): - return left - right - elif isinstance(node.op, ast.Mult): - return left * right - elif isinstance(node.op, ast.Div): - return left / right - elif isinstance(node.op, ast.Mod): - return left % right - elif isinstance(node.op, ast.Pow): - return left**right - else: - raise ValueError(f"Unsupported binary operator: {type(node.op)}") - elif isinstance(node, ast.Name): # variable name - try: - output_fun = context_map[node.id] - return output_fun - # TODO: raise the error back to the caller so that the llm can get the error message - except KeyError as e: - raise ValueError( - f"Error: {e}, {node.id} does not exist in the context_map." - ) - - elif isinstance( - node, ast.Call - ): # another fun or class as argument and value, e.g. add( multiply(4,5), 3) - func = evaluate_ast_node(node.func, context_map) - args = [evaluate_ast_node(arg, context_map) for arg in node.args] - kwargs = { - kw.arg: evaluate_ast_node(kw.value, context_map) for kw in node.keywords - } - print(f"another fun or class as argument and value: {func}, {args}, {kwargs}") - output = func(*args, **kwargs) - if isinstance(output, ToolOutput): - return output.raw_output - print(f"output: {output}") - return output - else: - raise ValueError(f"Unsupported AST node type: {type(node)}") - - -def parse_function_call( - call_string: str, context_map: Dict[str, Any] = None -) -> Tuple[str, List[Any], Dict[str, Any]]: - """ - Parse a string representing a function call into its components and ensure safe execution by only allowing function calls from a predefined context map. - Args: - call_string (str): The string representing the function call. - context_map (Dict[str, Any]): A dictionary that maps variable names to their respective values and functions. - This context is used to resolve names and execute functions. - """ - call_string = call_string.strip() - # Parse the string into an AST - tree = ast.parse(call_string, mode="eval") - - if isinstance(tree.body, ast.Call): - # Extract the function name - func_name = tree.body.func.id if isinstance(tree.body.func, ast.Name) else None - - # Prepare the list of arguments and keyword arguments - args = [evaluate_ast_node(arg, context_map) for arg in tree.body.args] - keywords = { - kw.arg: evaluate_ast_node(kw.value, context_map) - for kw in tree.body.keywords - } - - return func_name, args, keywords - else: - raise ValueError("Provided string is not a function call.") diff --git a/lightrag/core/tool_helper.py b/lightrag/core/tool_helper.py deleted file mode 100644 index 2f960aa8..00000000 --- a/lightrag/core/tool_helper.py +++ /dev/null @@ -1,229 +0,0 @@ -""" -Tool is LLM's extended capability which is one of the core design pattern of Agent. 
All tools can be wrapped in a FunctionTool class. -This helps to standardize the tool interface and metadata to communicate with the Agent. -""" - -from typing import Any, Optional, Dict, Callable, Awaitable -from inspect import iscoroutinefunction, signature, Parameter - -from abc import ABC, abstractmethod -import json - -AsyncCallable = Callable[..., Awaitable[Any]] - - -############################################## -# Tool data classes, using BaseModel to auto-generate schema -# Simplified version of LlamaIndex's BaseTool -############################################## -class ToolOutput(ABC): - str_content: Optional[str] = None # Initially allow str_content to be optional - - name: Optional[str] = None - raw_input: Dict[str, Any] - raw_output: Any - - def __init__(self, **data): - if "str_content" not in data or data["str_content"] is None: - data["str_content"] = str(data["raw_output"]) - # super().__init__(**data) - self.str_content = data.get("str_content", None) - self.name = data.get("name", None) - self.raw_input = data.get("raw_input", {}) - self.raw_output = data.get("raw_output", None) - - def __str__(self) -> str: - return str(self.str_content) - - -class ToolMetadata(ABC): - """ - Metadata for a tool. Can be passed to LLM for tool registration. - """ - - description: str - name: Optional[str] = None # TODO: make it a class_name - parameters: Dict[str, Any] = {} - - def __init__(self, **data): - # initialize here - self.name = data.get("name", None) - self.description = data.get("description", "") - self.parameters = data.get("parameters", {}) - - def get_parameters_dict(self) -> dict: - parameters = { - k: v - for k, v in self.parameters.items() - if k in ["type", "properties", "required", "definitions"] - } - return parameters - - @property - def tool_str(self) -> str: - """ - Return a string representation of the tool. - """ - return self.description - - @property - def fn_schema_str(self) -> str: - parameters = self.get_parameters_dict() - return json.dumps(parameters) - - def get_name(self) -> str: - if self.name is None: - raise ValueError("name is None.") - return self.name - - -def get_fun_schema(name: str, func: Callable[..., Any]) -> Dict[str, Any]: - r"""Get the schema of a function. 
- Examples: - def example_function(x: int, y: str = "default") -> int: - return x - schema = get_fun_schema("example_function", example_function) - print(json.dumps(schema, indent=4)) - # Output: - { - "type": "object", - "properties": { - "x": { - "type": "int" - }, - "y": { - "type": "str", - "default": "default" - } - }, - "required": [ - "x" - ] - } - """ - sig = signature(func) - schema = {"type": "object", "properties": {}, "required": []} - - for name, parameter in sig.parameters.items(): - param_type = ( - parameter.annotation.__name__ - if parameter.annotation != Parameter.empty - else "Any" - ) - if parameter.default == Parameter.empty: - schema["required"].append(name) - schema["properties"][name] = {"type": param_type} - else: - schema["properties"][name] = { - "type": param_type, - "default": parameter.default, - } - # add definitions if nested model exists - if hasattr(parameter.annotation, "__annotations__"): - schema["definitions"] = {name: get_fun_schema(name, parameter.annotation)} - - return schema - - -############################################## -# FunctionTool -############################################## -import dataclasses - - -@dataclasses.dataclass -class FunctionTool: - """ - There is almost no need to customize a FunctionTool, but you can do so if you want to. - Support both positional and keyword arguments. - NOTE: - - at least one of fn or async_fn must be provided. - - When both are provided, sync (call) will be used in __call__. - """ - - def __init__( - self, - metadata: ToolMetadata, - fn: Optional[ - Callable[..., Any] - ] = None, # at least one of fn or async_fn must be provided - async_fn: Optional[AsyncCallable] = None, - ) -> None: - self._fn = None - self._async_fn = None - if fn: - self._fn = fn - elif async_fn: - if not iscoroutinefunction(async_fn): - raise ValueError("async_fn must be an asynchronous function") - self._async_fn = async_fn - - else: - raise ValueError("At least one of fn or async_fn must be provided") - - self._metadata = metadata - - def __repr__(self) -> str: - return f"FunctionTool({self.metadata.name})" - - def __str__(self) -> str: - return f"FunctionTool({self.metadata.name})" - - @classmethod - def from_defaults( - cls, - fn: Optional[ - Callable[..., Any] - ] = None, # at least one of fn or async_fn must be provided - async_fn: Optional[AsyncCallable] = None, - name: Optional[str] = None, - description: Optional[ - str - ] = None, # if not provided, use function name, signature and docstring - tool_metadata: Optional[ToolMetadata] = None, - ) -> "FunctionTool": - if tool_metadata is None: - name = name or fn.__name__ - docstring = fn.__doc__ - # sample_function(x, y, user: tests.test_tool.User = User(id=1, name='John')) - # two numbers together and returns the sum. 
- description = description or f"{name}{signature(fn)}\n{docstring}" - - # fn_parameters are more readable than the above name, signature and docstring combination - fn_parameters = get_fun_schema(name, fn) - - tool_metadata = ToolMetadata( - name=name, description=description, parameters=fn_parameters - ) - return cls(fn=fn, metadata=tool_metadata, async_fn=async_fn) - - @property - def metadata(self) -> ToolMetadata: - return self._metadata - - @property - def fn(self) -> Callable[..., Any]: - return self._fn - - @property - def async_fn(self) -> AsyncCallable: - return self._async_fn - - def __call__(self, *args: Any, **kwargs: Any) -> ToolOutput: - return self.call(*args, **kwargs) - - def call(self, *args: Any, **kwargs: Any) -> ToolOutput: - tool_output = self._fn(*args, **kwargs) - return ToolOutput( - tool_name=self.metadata.name, - raw_input={"args": args, "kwargs": kwargs}, - raw_output=tool_output, - ) - - async def acall(self, *args: Any, **kwargs: Any) -> ToolOutput: - tool_output = await self._async_fn(*args, **kwargs) - return ToolOutput( - tool_name=self.metadata.name, - raw_input={"args": args, "kwargs": kwargs}, - raw_output=tool_output, - ) diff --git a/lightrag/lightrag/__init__.py b/lightrag/lightrag/__init__.py new file mode 100644 index 00000000..d33bab7c --- /dev/null +++ b/lightrag/lightrag/__init__.py @@ -0,0 +1,3 @@ +from lightrag.utils import setup_env + +setup_env() diff --git a/lightrag/database/__init__.py b/lightrag/lightrag/components/__init__.py similarity index 100% rename from lightrag/database/__init__.py rename to lightrag/lightrag/components/__init__.py diff --git a/lightrag/lightrag/components/agent/README.md b/lightrag/lightrag/components/agent/README.md new file mode 100644 index 00000000..cb892e69 --- /dev/null +++ b/lightrag/lightrag/components/agent/README.md @@ -0,0 +1,46 @@ + +# Agent is not a model or LLM model. +# Agent is better defined as a system that uses LLM models to plan and replan steps that each involves the usage of various tools, +# such as function calls, another LLM model based on the context and history (memory) to complete a task autonomously. + + +# REact agent can be useful for +# - Multi-hop reasoning [Q&A], including dividing the query into subqueries and answering them one by one. +# - Plan the usage of the given tools: highly flexible. Retriever, Generator modules or any other functions can all be wrapped as tools. + +# The initial ReAct paper does not support different types of tools. We have greatly extended the flexibility of tool adaption, even including an llm tool +# to answer questions that cant be answered or better be answered by llm using its world knowledge. +# - Every react agent can be given a different tasks, different tools, and different LLM models to complete the task. +# - 'finish' tool is defined to finish the task by joining all subqueries answers. + +# Reference: +# [1] LLM Agent survey: https://github.com/Paitesanshi/LLM-Agent-Survey +Agent is not a model or LLM model. + +Agent is better defined as a system that uses LLM models to plan and replan steps that each involves the usage of various tools, +such as function calls, another LLM model based on the context and history (memory) to complete a task autonomously. + +The future: the agent can write your prompts too. 
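Before moving on to the new agent package: a small, hedged sketch of how the removed `FunctionTool.from_defaults` wrapper was used, since its docstring only describes the behavior. The `add` function is a hypothetical example; the metadata (name, description from the signature and docstring, and the JSON schema from `get_fun_schema`) is generated automatically.

```python
# Assumes FunctionTool from the (removed) lightrag.core.tool_helper module shown above.
def add(a: int, b: int) -> int:
    """Add two numbers."""
    return a + b

tool = FunctionTool.from_defaults(fn=add)
print(tool.metadata.name)           # add
print(tool.metadata.fn_schema_str)  # JSON schema with properties "a", "b" and required fields

output = tool(2, 3)                 # returns a ToolOutput wrapping the raw result
print(output.raw_output)            # 5
```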
Check out dspy: https://github.com/stanfordnlp/dspy + +In this directory, we add the general design patterns of agent, here are four (Thanks to Andrew Ng): + +1️⃣ Reflection + +- Self-Refine: Iterative Refinement with Self-Feedback +- Reflexion: Language Agents with Verbal Reinforcement Learning + +2️⃣ Tool use + +- Gorilla: Large Language Model Connected with Massive APIs +- MM-REACT: Prompting ChatGPT for Multimodal Reasoning and Action + +3️⃣ Planning + +- Chain-of-Thought Prompting Elicits Reasoning in Large Language Models +- HuggingGPT: Solving AI Tasks with ChatGPT and its Friends in Hugging Face +- React + +4️⃣ Multi-agent collaboration + +- Communicative Agents for Software Development +- AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation diff --git a/lightrag/lightrag/components/agent/__init__.py b/lightrag/lightrag/components/agent/__init__.py new file mode 100644 index 00000000..5f5c5844 --- /dev/null +++ b/lightrag/lightrag/components/agent/__init__.py @@ -0,0 +1,10 @@ +from .react import DEFAULT_REACT_AGENT_SYSTEM_PROMPT, ReactAgent +from lightrag.utils.registry import EntityMapping + +__all__ = [ + "ReactAgent", + "DEFAULT_REACT_AGENT_SYSTEM_PROMPT", +] + +for name in __all__: + EntityMapping.register(name, globals()[name]) diff --git a/lightrag/lightrag/components/agent/react.py b/lightrag/lightrag/components/agent/react.py new file mode 100644 index 00000000..55d5cb62 --- /dev/null +++ b/lightrag/lightrag/components/agent/react.py @@ -0,0 +1,427 @@ +"""Implementation of ReAct.""" + +from typing import List, Union, Callable, Optional, Any, Dict +from copy import deepcopy +import logging + + +from lightrag.core.generator import Generator +from lightrag.core.component import Component +from lightrag.core.func_tool import FunctionTool, AsyncCallable +from lightrag.core.tool_manager import ToolManager +from lightrag.components.output_parsers import JsonOutputParser +from lightrag.core.types import ( + StepOutput, + GeneratorOutput, + Function, + FunctionOutput, + FunctionExpression, +) +from lightrag.core.model_client import ModelClient +from lightrag.utils.logger import printc + + +log = logging.getLogger(__name__) + +DEFAULT_REACT_AGENT_SYSTEM_PROMPT = r"""<> +{# role/task description #} +You are a helpful assistant. +Answer the user's query using the tools provided below with minimal steps and maximum accuracy. +{# REACT instructions #} +Each step you will read the previous Thought, Action, and Observation(execution result of the action) and then provide the next Thought and Action. +{# Tools #} +{% if tools %} + +You available tools are: +{# tools #} +{% for tool in tools %} +{{ loop.index }}. +{{tool}} +------------------------ +{% endfor %} + +{% endif %} +{# output is always more robust to use json than string #} + +{{output_format_str}} + + +{# Specifications TODO: preference between the usage of llm tool vs the other tool #} +- For simple queries: Directly call the ``finish`` action and provide the answer. +- For complex queries: + - Step 1: Read the user query and potentially divide it into subqueries. And get started with the first subquery. + - Call one available tool at a time to solve each subquery/subquestion. \ + - At step 'finish', join all subqueries answers and finish the task. +Remember: +- Action must call one of the above tools with name. It can not be empty. +- You will always end with 'finish' action to finish the task. The answer can be the final answer or failure message. 
+ +{#Examples can be here#} +{# Check if there are any examples #} +{% if examples %} + +{% for example in examples %} +{{ example }} +{% endfor %} + +{% endif %} + +<> +----------------- +{% if input_str %} +User query: +{{ input_str }} +{% endif %} +{# Step History #} +{% if step_history %} + +{% for history in step_history %} +Step {{ loop.index }}. +{ + "thought": "{{history.thought}}", + "action": "{{history.action.action}}", +} +"Observation": "{{history.observation}}" +------------------------ +{% endfor %} + +{% endif %} +""" + + +class ReActAgent(Component): + __doc__ = r"""ReActAgent uses generator as a planner that runs multiple and sequential functional call steps to generate the final response. + + Users need to set up: + - tools: a list of tools to use to complete the task. Each tool is a function or a function tool. + - max_steps: the maximum number of steps the agent can take to complete the task. + - use_llm_as_fallback: a boolean to decide whether to use an additional LLM model as a fallback tool to answer the query. + - model_client: the model client to use to generate the response. + - model_kwargs: the model kwargs to use to generate the response. + + For the generator, the default arguments are: + (1) default prompt: DEFAULT_REACT_AGENT_SYSTEM_PROMPT + (2) default output_processors: JsonParser + + There are `examples` which is optional, a list of string examples in the prompt. + + Example: + + .. code-block:: python + from core.openai_client import OpenAIClient + from components.agent.react import ReActAgent + from core.func_tool import FunctionTool + # define the tools + def multiply(a: int, b: int) -> int: + '''Multiply two numbers.''' + return a * b + def add(a: int, b: int) -> int: + '''Add two numbers.''' + return a + b + agent = ReActAgent( + tools=[multiply, add], + model_client=OpenAIClient, + model_kwargs={"model": "gpt-3.5-turbo"}, + ) + + Using examples: + + preset_prompt_kwargs = {"examples": examples} + agent = ReActAgent( + tools=[multiply, add], + model_client=OpenAIClient, + model_kwargs={"model": "gpt-3.5-turbo"}, + preset_prompt_kwargs=preset_prompt_kwargs, + ) + + Reference: + [1] https://arxiv.org/abs/2210.03629, published in Mar, 2023. 
+ """ + + def __init__( + self, + # added arguments specifc to React + tools: List[Union[Callable, AsyncCallable, FunctionTool]] = [], + max_steps: int = 10, + add_llm_as_fallback: bool = True, + *, + # the following arguments are mainly for the planner + model_client: ModelClient, + model_kwargs: Dict = {}, + ): + super().__init__() + template = DEFAULT_REACT_AGENT_SYSTEM_PROMPT + + self.max_steps = max_steps + + self.add_llm_as_fallback = add_llm_as_fallback + + self._init_tools(tools, model_client, model_kwargs) + + ouput_data_class = FunctionExpression + example = FunctionExpression.from_function( + thought="I have finished the task.", + func=self._finish, + answer="final answer: 'answer'", + ) + output_parser = JsonOutputParser(data_class=ouput_data_class, example=example) + prompt_kwargs = { + "tools": self.tool_manager.yaml_definitions, + "output_format_str": output_parser.format_instructions(), + } + self.planner = Generator( + template=template, + prompt_kwargs=prompt_kwargs, + output_processors=output_parser, + model_client=model_client, + model_kwargs=model_kwargs, + ) + + self.step_history: List[StepOutput] = [] + + def _init_tools( + self, + tools: List[Union[Callable, AsyncCallable, FunctionTool]], + model_client: ModelClient, + model_kwargs: Dict, + ): + r"""Initialize the tools.""" + tools = deepcopy(tools) + _additional_llm_tool = ( + Generator(model_client=model_client, model_kwargs=model_kwargs) + if self.add_llm_as_fallback + else None + ) + + def llm_tool(input: str) -> str: + """I answer any input query with llm's world knowledge. Use me as a fallback tool or when the query is simple.""" + # use the generator to answer the query + try: + output: GeneratorOutput = _additional_llm_tool( + prompt_kwargs={"input_str": input} + ) + response = output.data if output else None + return response + except Exception as e: + log.error(f"Error using the generator: {e}") + print(f"Error using the generator: {e}") + + return None + + def finish(answer: str) -> str: + """Finish the task with answer.""" + return answer + + self._finish = finish + + if self.add_llm_as_fallback: + tools.append(llm_tool) + tools.append(finish) + self.tool_manager = ToolManager(tools=tools) + + def reset(self): + r"""Reset the agent to start a new query.""" + self.step_history = [] + + def _execute_action(self, action_step: StepOutput) -> Optional[StepOutput]: + """ + Parse the action string to a function call and execute it. Update the action_step with the result. + """ + action = action_step.action + try: + + fun: Function = self.tool_manager.parse_function_call_expr(action) + result: FunctionOutput = self.tool_manager.execute_function(fun) + # TODO: optimize the action_step + action_step.fun_name = fun.name + action_step.fun_args = fun.args + action_step.fun_kwargs = fun.kwargs + action_step.observation = result.output + return action_step + except Exception as e: + log.error(f"Error executing {action}: {e}") + # pass the error as observation so that the agent can continue and correct the error in the next step + action_step.observation = f"Error executing {action}: {e}" + return action_step + + def _run_one_step(self, step: int, prompt_kwargs: Dict, model_kwargs: Dict) -> str: + """ + Run one step of the agent. 
+ """ + # step_history is the only per-query variable, and should not be controlled by the user + # add the step_history to the prompt_kwargs + prompt_kwargs["step_history"] = self.step_history + + log.debug( + f"Running step {step} with prompt: {self.planner.prompt(**prompt_kwargs)}" + ) + + # call the super class Generator to get the response + response: GeneratorOutput = self.planner( + prompt_kwargs=prompt_kwargs, model_kwargs=model_kwargs + ) + step_output: StepOutput = None + try: + fun_expr: FunctionExpression = FunctionExpression.from_dict(response.data) + step_output = StepOutput( + step=step, thought=fun_expr.thought, action=fun_expr + ) + # print the func expr + log.debug(f"Step {step}: {fun_expr}") + + # execute the action + if step_output and step_output.action: + step_output = self._execute_action(step_output) + printc(f"Step {step}: \n{step_output}\n_______\n", color="blue") + else: + log.error(f"Failed to parse response for step {step}") + except Exception as e: + log.error(f"Error running step {step}: {e}") + if step_output is None: + step_output = StepOutput(step=step, thought="", action="") + else: + step_output.observation = f"Error running step {step}: {e}" + self.step_history.append(step_output) + + return response + + def call( + self, + input: str, + promt_kwargs: Optional[Dict] = {}, + model_kwargs: Optional[Dict] = {}, + ) -> Any: + r"""prompt_kwargs: additional prompt kwargs to either replace or add to the preset prompt kwargs.""" + prompt_kwargs = {**promt_kwargs, "input_str": input} + # prompt_kwargs["input_str"] = input + printc(f"input_query: {input}", color="red") + for i in range(self.max_steps): + step = i + 1 + try: + self._run_one_step(step, prompt_kwargs, model_kwargs) + if ( + self.step_history[-1].fun_name + and self.step_history[-1].fun_name == "finish" + ): + break + except Exception as e: + log.error(f"Error running step {step}: {e}") + + answer = self.step_history[-1].observation + printc(f"answer:\n {answer}", color="green") + log.info(f"step_history: {self.step_history}") + self.reset() + return answer + + def _extra_repr(self) -> str: + s = f"max_steps={self.max_steps}, add_llm_as_fallback={self.add_llm_as_fallback}" + s += super()._extra_repr() + return s + + +if __name__ == "__main__": + from components.model_client import GroqAPIClient + from lightrag.core.types import ModelClientType + from lightrag.utils import setup_env # noqa + + # from lightrag.utils import enable_library_logging + + # enable_library_logging(level="DEBUG") + + def multiply(a: int, b: int) -> int: + """ + Multiply two numbers. + """ + return a * b + + def add(a: int, b: int) -> int: + """ + Add two numbers. + """ + return a + b + + def divide(a: float, b: float) -> float: + """ + Divide two numbers. + """ + return float(a) / b + + def search(query: str) -> str: + """ + Search the web for the given query. + """ + return "python programming is a great way to learn programming" + + tools = [ + FunctionTool(fn=multiply), + FunctionTool(fn=add), + FunctionTool(fn=divide), + # FunctionTool.from_defaults(fn=search), + ] + llm_model_kwargs = { + "model": "llama3-70b-8192", # llama3 is not good with string formatting, llama3 8b is also bad at following instruction, 70b is better but still not as good as gpt-3.5-turbo + # mistral also not good: mixtral-8x7b-32768, but with better prompt, it can still work + "temperature": 0.0, + } + + gpt_3_5_turbo_model_kwargs = { + "model": "gpt-3.5-turbo", + } + + examples = [ + # r""" + # User: What is 9 - 3? 
+ # You: { + # "thought": "I need to subtract 3 from 9, but there is no subtraction tool, so I ask llm_tool to answer the query.", + # "action": "llm_tool('What is 9 - 3?')" + # } + # """ + ] + # agent = ReActAgent( + # # examples=examples, + # tools=tools, + # max_steps=5, + # model_client=GroqAPIClient, + # model_kwargs=llm_model_kwargs, + # ) + # print(agent) + queries = [ + # "What is 2 times 3?", + # "What is 3 plus 4?", + "What is the capital of France? and what is 465 times 321 then add 95297 and then divide by 13.2?", + # "Li adapted her pet Apple in 2017 when Apple was only 2 months old, now we are at year 2024, how old is Li's pet Apple?", + "Give me 5 words rhyming with cool, and make a 4-sentence poem using them", + ] + """ + Results: mixtral-8x7b-32768, 0.9s per query + llama3-70b-8192, 1.8s per query + gpt-3.5-turbo, 2.2s per query + """ + import time + + generator = Generator( + model_client=GroqAPIClient(), + model_kwargs=llm_model_kwargs, + ) + # for i in range(3): + agent = ReActAgent( + tools=tools, + max_steps=5, + model_client=ModelClientType.GROQ(), + model_kwargs=llm_model_kwargs, + ) + # agent.llm_planner.print_prompt() + # print(agent) + + # vs not using agent + # print(agent.tools) + + average_time = 0 + for query in queries: + t0 = time.time() + answer = agent(query) + average_time += time.time() - t0 + answer_no_agent = generator(prompt_kwargs={"input_str": query}) + print(f"Answer with agent: {answer}") + print(f"Answer without agent: {answer_no_agent}") + print(f"Average time: {average_time / len(queries)}") diff --git a/lightrag/components/data_process/__init__.py b/lightrag/lightrag/components/data_process/__init__.py similarity index 67% rename from lightrag/components/data_process/__init__.py rename to lightrag/lightrag/components/data_process/__init__.py index c88b3389..b50494d5 100644 --- a/lightrag/components/data_process/__init__.py +++ b/lightrag/lightrag/components/data_process/__init__.py @@ -1,11 +1,11 @@ """Components here are used for data processing/transformation.""" -from .document_splitter import DocumentSplitter +from .text_splitter import TextSplitter from .data_components import ToEmbeddings, RetrieverOutputToContextStr from lightrag.utils.registry import EntityMapping -__all__ = ["DocumentSplitter", "ToEmbeddings", "RetrieverOutputToContextStr"] +__all__ = ["TextSplitter", "ToEmbeddings", "RetrieverOutputToContextStr"] for name in __all__: EntityMapping.register(name, globals()[name]) diff --git a/lightrag/components/data_process/data_components.py b/lightrag/lightrag/components/data_process/data_components.py similarity index 100% rename from lightrag/components/data_process/data_components.py rename to lightrag/lightrag/components/data_process/data_components.py diff --git a/lightrag/components/data_process/text_splitter.py b/lightrag/lightrag/components/data_process/text_splitter.py similarity index 50% rename from lightrag/components/data_process/text_splitter.py rename to lightrag/lightrag/components/data_process/text_splitter.py index ad7bd7a6..6d19b1db 100644 --- a/lightrag/components/data_process/text_splitter.py +++ b/lightrag/lightrag/components/data_process/text_splitter.py @@ -21,7 +21,7 @@ from lightrag.core.component import Component from lightrag.core.types import Document -from lightrag.components.retriever.bm25_retriever import split_text_tokenized +from lightrag.core.tokenizer import Tokenizer # TODO: # More splitters such as PDF/JSON/HTML Splitter can be built on TextSplitter. 
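Condensing the test-drive code in the `__main__` block above, a minimal usage sketch of the new `ReActAgent` might look like the following. The `multiply` tool and the Groq model choice mirror the example code in this diff; swap in your own model client and kwargs as needed.

```python
from lightrag.core.types import ModelClientType
from lightrag.core.func_tool import FunctionTool
from lightrag.components.agent.react import ReActAgent

def multiply(a: int, b: int) -> int:
    """Multiply two numbers."""
    return a * b

agent = ReActAgent(
    tools=[FunctionTool(fn=multiply)],
    max_steps=5,
    add_llm_as_fallback=True,  # adds an llm_tool for queries the tools cannot answer
    model_client=ModelClientType.GROQ(),
    model_kwargs={"model": "llama3-70b-8192", "temperature": 0.0},
)

answer = agent("What is 465 times 321?")
print(answer)
```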
@@ -32,53 +32,67 @@ DocumentSplitterOutputType = List[Document] # customizable seperators map -SEPARATORS = {"page": "\f", "passage": "\n\n", "word": " ", "sentence": ".", "token": ""} +SEPARATORS = { + "page": "\f", + "passage": "\n\n", + "word": " ", + "sentence": ".", + "token": "", +} + +DEFAULT_CHUNK_SIZE = 800 +DEFAULT_CHUNK_OVERLAP = 200 + +tokenizer = Tokenizer() -DEFAULT_CHUNK_SIZE = 1024 -DEFAULT_CHUNK_OVERLAP = 20 class TextSplitter(Component): - """ - Text Splitter for Chunking Documents in Batch - - The ``TextSplitter`` is designed for splitting plain text into manageable chunks. - It supports 2 types of splitting. - - * Type 1: Specify the exact text splitting point such as space<" "> and periods<".">. It is intuitive: - "Hello, world!" -> ["Hello, " ,"world!"] - - * Type 2: Use :class:`tokenizer `. It works as: - "Hello, world!" -> ['Hello', ',', ' world', '!'] - - .. note:: - The punctuation is considered as a token. - - This aligns with how models see text in the form of tokens. (`Reference `_) - - Simple text splitting(Type 1) can underestimate the number of tokens. Tokenizer reflects the real token numbers the models take in. - But the Tokenizer here only works at world level. - - * **Definitions** - - ``split_by``: Specifies the text-splitting criterion using predefined keys like "word", "sentence", "page", "passage", and "token". The splitter utilizes the corresponding separator from the ``SEPARATORS`` dictionary. - - ``SEPARATORS``: Maps ``split_by`` criterions to their exact text separators, e.g., spaces<" "> for "word" or periods<"."> for "sentence". - - Usage: **SEPARATORS[``split_by``]=separator** - - .. note:: - For option ``token``, its separator is "" because we directly split by a tokenizer, instead of text point. - - * **Overview**: + """ + Text Splitter for Chunking Documents + ``TextSplitter`` first utilizes ``split_by`` to specify the text-splitting criterion and breaks the long text into smaller texts. Then we create a sliding window with length= ``chunk_size``. It moves at step= ``chunk_size`` - ``chunk_overlap``. - The texts inside each window will get concatenated to a smaller chunk. The generated chunks from the splitted text will be returned. - + The texts inside each window will get merged to a smaller chunk. The generated chunks from the splitted text will be returned. + + **Splitting Types** + + ``TextSplitter`` supports 2 types of splitting. + + * **Type 1:** Specify the exact text splitting point such as space<" "> and periods<".">. It is intuitive, for example, split_by "word": + + :: + + "Hello, world!" -> ["Hello, " ,"world!"] + + * **Type 2:** Use :class:`tokenizer `. It works as: + + :: + + "Hello, world!" -> ['Hello', ',', ' world', '!'] + + This aligns with how models see text in the form of tokens (`Reference `_), + Tokenizer reflects the real token numbers the models take in and helps the developers control budgets. + + **Definitions** + + * **split_by** specifies the split rule, i.e. the smallest unit during splitting. We support ``"word"``, ``"sentence"``, ``"page"``, ``"passage"``, and ``"token"``. The splitter utilizes the corresponding separator from the ``SEPARATORS`` dictionary. + For Type 1 splitting, we apply ``Python str.split()`` to break the text. + + * **SEPARATORS**: Maps ``split_by`` criterions to their exact text separators, e.g., spaces <" "> for "word" or periods <"."> for "sentence". + + .. note:: + For option ``token``, its separator is "" because we directly split by a tokenizer, instead of text point. 
+ + * **chunk_size** is the the maximum number of units in each chunk. + + * **chunk_overlap** is the number of units that each chunk should overlap. Including context at the borders prevents sudden meaning shift in text between sentences/context, especially in sentiment analysis. + + * **Splitting Details** - Type 1: - The ``TextSplitter`` utilizes Python's ``str.split(separator)`` method. - Developers can refer to - + Type 1: + The ``TextSplitter`` utilizes Python's ``str.split(separator)`` method. + Developers can refer to + .. code-block:: none { @@ -88,97 +102,73 @@ class TextSplitter(Component): "sentence": "." } for exact points of text division. - + .. note:: Developers need to determine how to assign text to each data chunk for the embedding and retrieval tasks. - The ``TextSplitter`` ``split_by`` cases: - - - "word": Splits the text at every space (" "), treating spaces as the boundaries between words. - - - "sentence": Splits the text at every period ("."), treating these as the ends of sentences. - - - "page": Splits the text at form feed characters ("\\f"), which are often used to represent page breaks in documents. - - - "passage": Splits the text at double newline characters ("\\n\\n"), useful for distinguishing between paragraphs or sections. Type 2: We implement a tokenizer using ``cl100k_base`` encoding that aligns with how models see text in the form of tokens. E.g. "tiktoken is great!" -> ["t", "ik", "token", " is", " great", "!"] This helps developers control the token usage and budget better. - - - * **Customization** - You can also customize the ``SEPARATORS``. For example, by defining ``SEPARATORS`` = {"question": "?"} and setting ``split_by`` = "question", the document will be split at each ``?``, ideal for processing text structured - as a series of questions. If you need to customize :class:`tokenizer `, please check `Reference `_. - - * **Concatenating Details** + + * **Merge Details** Type 1/Type 2 create a list of split texts. ``TextSplitter`` then reattaches the specified separator to each piece of the split text, except for the last segment. This approach maintains the original spacing and punctuation, which is critical in contexts like natural language processing where text formatting can impact interpretations and outcomes. E.g. "hello world!" split by "word" will be kept as "hello " and "world!" - - * **Use Cases** + + * **Customization** + You can also customize the ``SEPARATORS``. For example, by defining ``SEPARATORS`` = {"question": "?"} and setting ``split_by`` = "question", the document will be split at each ``?``, ideal for processing text structured + as a series of questions. If you need to customize :class:`tokenizer `, please check `Reference `_. + + * **Integration with Other Document Types** This functionality is ideal for segmenting texts into sentences, words, pages, or passages, which can then be processed further for NLP applications. - - To handle PDF content, developers need to first extract the text using tools like ``PyPDF2`` or ``PDFMiner`` before splitting. - + For **PDFs**, developers will need to extract the text before using the splitter. Libraries like ``PyPDF2`` or ``PDFMiner`` can be utilized for this purpose. + ``LightRAG``'s future implementations will introduce splitters for ``JSON``, ``HTML``, ``markdown``, and ``code``. + Example: - .. 
code-block:: python - - from lightrag.core.text_splitter import TextSplitter - from lightrag.core.types import Document - - # configure the splitter setting - text_splitter_settings = { - "split_by": "word", - "chunk_size": 20, - "chunk_overlap": 2, - } - - # set up the document splitter - text_splitter = TextSplitter( - split_by=text_splitter_settings["split_by"], - chunk_size=text_splitter_settings["chunk_size"], - chunk_overlap=text_splitter_settings["chunk_overlap"], - ) - doc1 = Document( - meta_data={"title": "Luna's Profile"}, - text="lots of more nonsense text." * 2 - + "Luna is a domestic shorthair." - + "lots of nonsense text." * 3, - id="doc1", - ) - doc2 = Document( - meta_data={"title": "Luna's Hobbies"}, - text="lots of more nonsense text." * 2 - + "Luna loves to eat lickable treats." - + "lots of more nonsense text." * 2 - + "Luna loves to play cat wand." - + "lots of more nonsense text." * 2 - + "Luna likes to sleep all the afternoon", - id="doc2", - ) - documents = [doc1, doc2] - - splitted_docs = (text_splitter.call(documents=documents)) - - for doc in splitted_docs: - print("*" * 50) - print(doc) - print("*" * 50) + .. code-block:: python + + from lightrag.components.data_process.text_splitter import TextSplitter + from lightrag.core.types import Document + + # Configure the splitter settings + text_splitter = TextSplitter( + split_by="word", + chunk_size=5, + chunk_overlap=1 + ) + + # Example document + doc = Document( + text="Example text. More example text. Even more text to illustrate.", + id="doc1" + ) + + # Execute the splitting + splitted_docs = text_splitter.call(documents=[doc]) + + for doc in splitted_docs: + print(doc) + + # Output: + # Document(id=44a8aa37-0d16-40f0-9ca4-2e25ae5336c8, text='Example text. More example text. ', meta_data=None, vector=[], parent_doc_id=doc1, order=0, score=None) + # Document(id=ca0af45b-4f88-49b5-97db-163da9868ea4, text='text. Even more text to ', meta_data=None, vector=[], parent_doc_id=doc1, order=1, score=None) + # Document(id=e7b617b2-3927-4248-afce-ec0fc247ac8b, text='to illustrate.', meta_data=None, vector=[], parent_doc_id=doc1, order=2, score=None) """ + def __init__( self, split_by: Literal["word", "sentence", "page", "passage", "token"] = "word", chunk_size: int = DEFAULT_CHUNK_SIZE, chunk_overlap: int = DEFAULT_CHUNK_OVERLAP, - batch_size: int = 1000 + batch_size: int = 1000, ): """ Initializes the TextSplitter with the specified parameters for text splitting. Args: - split_by (str): The specific criterion to use for splitting the text. - Valid options are 'word' to split by ' ', 'sentence' to split by '.', + split_by (str): The specific criterion to use for splitting the text. + Valid options are 'word' to split by ' ', 'sentence' to split by '.', 'page' to split by '\\f', 'passage' to split by '\\n\\n'. chunk_size (int): The size of chunks to generate after splitting. Must be greater than 0. chunk_overlap (int): The number of characters of overlap between chunks. Must be non-negative @@ -190,35 +180,35 @@ def __init__( """ super().__init__() - # variable value checks self.split_by = split_by - if split_by not in SEPARATORS: - options = ", ".join(f"'{key}'" for key in SEPARATORS.keys()) - log.error(f"Invalid options for split_by. You must select from {options}.") - raise ValueError(f"Invalid options for split_by. You must select from {options}.") - - if chunk_overlap >= chunk_size: - log.error(f"chunk_overlap can't be larger than or equal to chunk_size. 
Received chunk_size: {chunk_size}, chunk_overlap: {chunk_overlap}") - raise ValueError( - f"chunk_overlap can't be larger than or equal to chunk_size. Received chunk_size: {chunk_size}, chunk_overlap: {chunk_overlap}" - ) - - if chunk_size <= 0: - log.error(f"chunk_size must be greater than 0. Received value: {chunk_size}") - raise ValueError(f"chunk_size must be greater than 0. Received value: {chunk_size}") + assert ( + split_by in SEPARATORS + ), f"Invalid options for split_by. You must select from {list(SEPARATORS.keys())}." + + assert ( + chunk_overlap < chunk_size + ), f"chunk_overlap can't be larger than or equal to chunk_size. Received chunk_size: {chunk_size}, chunk_overlap: {chunk_overlap}" + + assert ( + chunk_size > 0 + ), f"chunk_size must be greater than 0. Received value: {chunk_size}" self.chunk_size = chunk_size - - if chunk_overlap < 0: - log.error(f"chunk_overlap must be non-negative. Received value: {chunk_overlap}") - raise ValueError(f"chunk_overlap must be non-negative. Received value: {chunk_overlap}") - self.chunk_overlap = chunk_overlap - + + assert ( + chunk_overlap >= 0 + ), f"chunk_overlap must be non-negative. Received value: {chunk_overlap}" + self.chunk_overlap = chunk_overlap + self.batch_size = batch_size + log.info( + f"Initialized TextSplitter with split_by={self.split_by}, chunk_size={self.chunk_size}, chunk_overlap={self.chunk_overlap}, batch_size={self.batch_size}" + ) + def split_text(self, text: str) -> List[str]: """ Splits the provided text into chunks. - + Splits based on the specified split_by, chunk size, and chunk overlap settings. Args: @@ -227,24 +217,28 @@ def split_text(self, text: str) -> List[str]: Returns: List[str]: A list of text chunks. """ - log.info(f"Splitting text with split_by: {self.split_by}, chunk_size: {self.chunk_size}, chunk_overlap: {self.chunk_overlap}") + log.info( + f"Splitting text with split_by: {self.split_by}, chunk_size: {self.chunk_size}, chunk_overlap: {self.chunk_overlap}" + ) separator = SEPARATORS[self.split_by] - splits = self._split_text(text, separator) + splits = self._split_text_into_units(text, separator) log.info(f"Text split into {len(splits)} parts.") - chunks = self._concatenate_splits(splits, self.chunk_size, self.chunk_overlap, separator) - log.info(f"Text concatenated into {len(chunks)} chunks.") + chunks = self._merge_units_to_chunks( + splits, self.chunk_size, self.chunk_overlap, separator + ) + log.info(f"Text merged into {len(chunks)} chunks.") return chunks def call(self, documents: DocumentSplitterInputType) -> DocumentSplitterOutputType: """ Process the splitting task on a list of documents in batch. - + Batch processes a list of documents, splitting each document's text according to the configured split_by, chunk size, and chunk overlap. Args: documents (List[Document]): A list of Document objects to process. - + Returns: List[Document]: A list of new Document objects, each containing a chunk of text from the original documents. @@ -252,20 +246,29 @@ def call(self, documents: DocumentSplitterInputType) -> DocumentSplitterOutputTy TypeError: If 'documents' is not a list or contains non-Document objects. ValueError: If any document's text is None. 
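As a quick aside, a worked example of the sliding-window arithmetic described above (the window advances by `step = chunk_size - chunk_overlap` units). This sketch assumes `TextSplitter` is imported as in the class docstring example earlier in this diff.

```python
# chunk_size=4, chunk_overlap=1 -> step = 3 units per window
splitter = TextSplitter(split_by="word", chunk_size=4, chunk_overlap=1)
chunks = splitter.split_text("one two three four five six seven eight")
# units: ["one", "two", ..., "eight"] (8 words)
# window 1 -> "one two three four ", window 2 -> "four five six seven ", last -> "seven eight"
print(chunks)
```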
""" - - if not isinstance(documents, list) or any(not isinstance(doc, Document) for doc in documents): + + if not isinstance(documents, list) or any( + not isinstance(doc, Document) for doc in documents + ): log.error("Input should be a list of Documents.") raise TypeError("Input should be a list of Documents.") - + split_docs = [] # Using range and batch_size to create batches - for start_idx in tqdm(range(0, len(documents), self.batch_size), desc="Splitting Documents in Batches"): - batch_docs = documents[start_idx:start_idx + self.batch_size] - + for start_idx in tqdm( + range(0, len(documents), self.batch_size), + desc="Splitting Documents in Batches", + ): + batch_docs = documents[start_idx : start_idx + self.batch_size] + for doc in batch_docs: if not isinstance(doc, Document): - log.error(f"Each item in documents should be an instance of Document, but got {type(doc).__name__}.") - raise TypeError(f"Each item in documents should be an instance of Document, but got {type(doc).__name__}.") + log.error( + f"Each item in documents should be an instance of Document, but got {type(doc).__name__}." + ) + raise TypeError( + f"Each item in documents should be an instance of Document, but got {type(doc).__name__}." + ) if doc.text is None: log.error(f"Text should not be None. Doc id: {doc.id}") @@ -274,61 +277,77 @@ def call(self, documents: DocumentSplitterInputType) -> DocumentSplitterOutputTy text_splits = self.split_text(doc.text) meta_data = deepcopy(doc.meta_data) - split_docs.extend([ - Document( - text=txt, - meta_data=meta_data, - parent_doc_id=f"{doc.id}", - order=i, - vector=[], - ) - for i, txt in enumerate(text_splits) - ]) - log.info(f"Processed {len(documents)} documents into {len(split_docs)} split documents.") + split_docs.extend( + [ + Document( + text=txt, + meta_data=meta_data, + parent_doc_id=f"{doc.id}", + order=i, + vector=[], + ) + for i, txt in enumerate(text_splits) + ] + ) + log.info( + f"Processed {len(documents)} documents into {len(split_docs)} split documents." + ) return split_docs - - def _split_text( - self, text: str, separator: str) -> List[str]: + + def _split_text_into_units(self, text: str, separator: str) -> List[str]: """Split text based on the specified separator.""" if self.split_by == "token": - splits = split_text_tokenized(text) + splits = tokenizer.encode(text) else: splits = text.split(separator) - log.info(f"Text split by '{separator}' into {len(splits)} parts.") + log.info(f"Text split by '{separator}' into {len(splits)} parts.") return splits - - def _concatenate_splits( + + def _merge_units_to_chunks( self, splits: List[str], chunk_size: int, chunk_overlap: int, separator: str ) -> List[str]: """ - Concatenates split text chunks based on the specified chunk size and overlap. + Merge split text chunks based on the specified chunk size and overlap. """ chunks = [] - # we use a window to get the text for each trunk, the window size is chunk_size, step is chunk_size - chunk_overlap + # we use a window to get the text for each trunk, the window size is chunk_size, step is chunk_size - chunk_overlap step = chunk_size - chunk_overlap idx = 0 - + for idx in range(0, len(splits), step): # 1. if the window exceeds the list of splitted string, break and process the last chunk # 2. 
if the window ends exactly the same with the splits, then break and treat the splits[idx:len(splits)] as the last chunk - if idx+chunk_size >= len(splits): + if idx + chunk_size >= len(splits): break - current_splits = splits[idx:idx+chunk_size] - # add the separator between each unit and concatenate the string + current_splits = splits[idx : idx + chunk_size] + # add the separator between each unit and merge the string # this won't be the last chunk, so we need to add the separator at the end - chunk = separator.join(current_splits) + separator + if self.split_by == "token": + chunk = current_splits # if token, then keep the original form + else: + chunk = separator.join(current_splits) + separator chunks.append(chunk) - + if idx < len(splits): - last_chunk = separator.join(splits[idx:]) + if self.split_by == "token": + last_chunk = splits[idx:] # if token, then keep the original form + else: + last_chunk = separator.join( + splits[idx:] + ) # if not token, then join into string if len(last_chunk) > 0: chunks.append(last_chunk) - log.info(f"Concatenated into {len(chunks)} chunks.") + + if self.split_by == "token": + # decode each chunk here + chunks = [tokenizer.decode(chunk) for chunk in chunks] + + log.info(f"Merged into {len(chunks)} chunks.") return chunks - + def _extra_repr(self) -> str: s = f"split_by={self.split_by}, chunk_size={self.chunk_size}, chunk_overlap={self.chunk_overlap}" return s - - -# test the execution llamaindex and langchain \ No newline at end of file + + +# test the execution llamaindex and langchain diff --git a/lightrag/database/sqlalchemy/__init__.py b/lightrag/lightrag/components/memory/__init__.py similarity index 100% rename from lightrag/database/sqlalchemy/__init__.py rename to lightrag/lightrag/components/memory/__init__.py diff --git a/lightrag/lightrag/components/memory/memory.py b/lightrag/lightrag/components/memory/memory.py new file mode 100644 index 00000000..d02647d2 --- /dev/null +++ b/lightrag/lightrag/components/memory/memory.py @@ -0,0 +1,23 @@ +"""Memory for user-assistant conversations. [Not completed] + +Memory can include data modeling, in-memory data storage, local file data storage, cloud data persistence, data pipeline, data retriever. +It is itself an LLM application and different use cases can do it differently. + + +This implementation covers the minimal and local memory experience for the user-assistant conversation. 
+""" + +from lightrag.core.types import ( + Conversation, +) + +from lightrag.core.db import LocalDB +from lightrag.core.component import Component + + +class Memory(Component): + def __init__(self, turn_db: LocalDB = None): + super().__init__() + self.current_convesation = Conversation() + self.turn_db = turn_db or LocalDB() # all turns + self.conver_db = LocalDB() # a list of conversations diff --git a/lightrag/components/model_client/__init__.py b/lightrag/lightrag/components/model_client/__init__.py similarity index 91% rename from lightrag/components/model_client/__init__.py rename to lightrag/lightrag/components/model_client/__init__.py index 6667e159..5d8c4413 100644 --- a/lightrag/components/model_client/__init__.py +++ b/lightrag/lightrag/components/model_client/__init__.py @@ -15,6 +15,10 @@ "lightrag.components.model_client.transformers_client.TransformerEmbedder", OptionalPackages.TRANSFORMERS, ) +TransformerLLM = LazyImport( + "lightrag.components.model_client.transformers_client.TransformerLLM", + OptionalPackages.TRANSFORMERS, +) TransformersClient = LazyImport( "lightrag.components.model_client.transformers_client.TransformersClient", OptionalPackages.TRANSFORMERS, @@ -49,6 +53,7 @@ "CohereAPIClient", "TransformerReranker", "TransformerEmbedder", + "TransformerLLM", "TransformersClient", "AnthropicAPIClient", "GroqAPIClient", diff --git a/lightrag/components/model_client/anthropic_client.py b/lightrag/lightrag/components/model_client/anthropic_client.py similarity index 100% rename from lightrag/components/model_client/anthropic_client.py rename to lightrag/lightrag/components/model_client/anthropic_client.py diff --git a/lightrag/components/model_client/cohere_client.py b/lightrag/lightrag/components/model_client/cohere_client.py similarity index 100% rename from lightrag/components/model_client/cohere_client.py rename to lightrag/lightrag/components/model_client/cohere_client.py diff --git a/lightrag/components/model_client/google_client.py b/lightrag/lightrag/components/model_client/google_client.py similarity index 98% rename from lightrag/components/model_client/google_client.py rename to lightrag/lightrag/components/model_client/google_client.py index 2c53fba9..ccc89065 100644 --- a/lightrag/components/model_client/google_client.py +++ b/lightrag/lightrag/components/model_client/google_client.py @@ -11,6 +11,7 @@ BadRequest, GoogleAPICallError, ) +from google.generativeai.types import GenerateContentResponse from lightrag.core.model_client import ModelClient diff --git a/lightrag/components/model_client/groq_client.py b/lightrag/lightrag/components/model_client/groq_client.py similarity index 100% rename from lightrag/components/model_client/groq_client.py rename to lightrag/lightrag/components/model_client/groq_client.py diff --git a/lightrag/components/model_client/openai_client.py b/lightrag/lightrag/components/model_client/openai_client.py similarity index 99% rename from lightrag/components/model_client/openai_client.py rename to lightrag/lightrag/components/model_client/openai_client.py index 30593bfa..e9d6e76f 100644 --- a/lightrag/components/model_client/openai_client.py +++ b/lightrag/lightrag/components/model_client/openai_client.py @@ -116,8 +116,6 @@ def init_async_client(self): raise ValueError("Environment variable OPENAI_API_KEY must be set") return AsyncOpenAI(api_key=api_key) - # save raw response - def parse_chat_completion(self, completion: Completion) -> Any: """Parse the completion to a str.""" log.debug(f"completion: {completion}") diff --git 
a/lightrag/components/model_client/transformers_client.py b/lightrag/lightrag/components/model_client/transformers_client.py similarity index 68% rename from lightrag/components/model_client/transformers_client.py rename to lightrag/lightrag/components/model_client/transformers_client.py index cf9aeba5..1d9f72d2 100644 --- a/lightrag/components/model_client/transformers_client.py +++ b/lightrag/lightrag/components/model_client/transformers_client.py @@ -13,6 +13,7 @@ AutoTokenizer, AutoModel, AutoModelForSequenceClassification, + AutoModelForCausalLM, ) from lightrag.core.model_client import ModelClient @@ -223,6 +224,106 @@ def __call__(self, **kwargs): raise ValueError(f"model {model_name} is not supported") +class TransformerLLM: + models: Dict[str, type] = {} + + def __init__(self, model_name: Optional[str] = "HuggingFaceH4/zephyr-7b-beta"): + super().__init__() + + if model_name is not None: + self.init_model(model_name=model_name) + + def init_model(self, model_name: str): + try: + self.tokenizer = AutoTokenizer.from_pretrained(model_name) + self.model = AutoModelForCausalLM.from_pretrained(model_name) + # register the model + self.models[model_name] = self.model + self.device = "cuda" if torch.cuda.is_available() else "cpu" + log.info(f"Done loading model {model_name}") + # Set pad token if it's not already set + if self.tokenizer.pad_token is None: + self.tokenizer.pad_token = self.tokenizer.eos_token # common fallback + self.model.config.pad_token_id = ( + self.tokenizer.eos_token_id + ) # ensure consistency in the model config + except Exception as e: + log.error(f"Error loading model {model_name}: {e}") + raise e + + def parse_chat_completion(self, input_text: str, response: str): + parsed_response = response.replace( + input_text, "" + ).strip() # Safely handle cases where input_text might not be in response + + return parsed_response if parsed_response else response + + def call( + self, + input_text: str, + skip_special_tokens: bool = True, + clean_up_tokenization_spaces: bool = False, + max_length: int = 150, + ): + if not self.model: + log.error("Model is not initialized.") + raise ValueError("Model is not initialized.") + + # Ensure tokenizer has pad token; set it if not + if self.tokenizer.pad_token is None: + self.tokenizer.pad_token = self.tokenizer.eos_token + self.model.config.pad_token_id = ( + self.tokenizer.eos_token_id + ) # Sync model config pad token id + + # Process inputs with attention mask and padding + inputs = self.tokenizer(input_text, return_tensors="pt", padding=True).to( + self.device + ) + # inputs = self.tokenizer(input_text, return_tensors="pt", padding="longest", truncation=True).to(self.device) + + with torch.no_grad(): # Ensures no gradients are calculated to save memory and computations + generate_ids = self.model.generate( + inputs["input_ids"], + attention_mask=inputs["attention_mask"], + max_length=max_length, # Control the output length more precisely + ) + response = self.tokenizer.decode( + generate_ids[0], + skip_special_tokens=skip_special_tokens, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + ) + parsed_response = self.parse_chat_completion(input_text, response) + return parsed_response + + def __call__( + self, + input_text: str, + skip_special_tokens: bool = True, + clean_up_tokenization_spaces: bool = False, + max_length: int = 150, + ): + return self.call( + input_text, + skip_special_tokens=skip_special_tokens, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + max_length=max_length, + ) + + # def 
call(self, input_text: str, skip_special_tokens: bool = True, clean_up_tokenization_spaces: bool = False): + # if not self.model: + # log.error("Model is not initialized.") + # raise ValueError("Model is not initialized.") + + # inputs = self.tokenizer(input_text, return_tensors="pt") + # generate_ids = self.model.generate(inputs.input_ids, max_length=30) + # response = self.tokenizer.batch_decode(generate_ids, skip_special_tokens=skip_special_tokens, clean_up_tokenization_spaces=clean_up_tokenization_spaces)[0] + # return response + + # def __call__(self, input_text: str, skip_special_tokens: bool = True, clean_up_tokenization_spaces: bool = False): + # return self.call(input_text, skip_special_tokens=skip_special_tokens, clean_up_tokenization_spaces=clean_up_tokenization_spaces) + + class TransformersClient(ModelClient): __doc__ = r"""LightRAG API client for transformers. @@ -236,6 +337,7 @@ class TransformersClient(ModelClient): "BAAI/bge-reranker-base": { "type": ModelType.RERANKER, }, + "HuggingFaceH4/zephyr-7b-beta": {"type": ModelType.LLM}, } def __init__(self, model_name: Optional[str] = None) -> None: @@ -249,6 +351,8 @@ def __init__(self, model_name: Optional[str] = None) -> None: self.sync_client = self.init_sync_client() elif self._model_name == "BAAI/bge-reranker-base": self.reranker_client = self.init_reranker_client() + elif self._model_name == "HuggingFaceH4/zephyr-7b-beta": + self.llm_client = self.init_llm_client() self.async_client = None def init_sync_client(self): @@ -257,6 +361,9 @@ def init_sync_client(self): def init_reranker_client(self): return TransformerReranker() + def init_llm_client(self): + return TransformerLLM() + def parse_embedding_response(self, response: Any) -> EmbedderOutput: embeddings: List[Embedding] = [] for idx, emb in enumerate(response): @@ -289,6 +396,15 @@ def call(self, api_kwargs: Dict = {}, model_type: ModelType = ModelType.UNDEFINE scores, api_kwargs["top_k"] ) return top_k_indices, top_k_scores + elif ( # LLM + model_type == ModelType.LLM + and "model" in api_kwargs + and api_kwargs["model"] == "HuggingFaceH4/zephyr-7b-beta" + ): + if not hasattr(self, "llm_client") or self.llm_client is None: + self.llm_client = self.init_llm_client() + response = self.llm_client(**api_kwargs) + return response def convert_inputs_to_api_kwargs( self, @@ -306,5 +422,9 @@ def convert_inputs_to_api_kwargs( assert "top_k" in final_model_kwargs, "top_k must be specified" final_model_kwargs["query"] = input return final_model_kwargs + elif model_type == ModelType.LLM: + assert "model" in final_model_kwargs, "model must be specified" + final_model_kwargs["input"] = input + return final_model_kwargs else: - raise ValueError(f"model_type {model_type} is not supported") \ No newline at end of file + raise ValueError(f"model_type {model_type} is not supported") diff --git a/lightrag/components/model_client/utils.py b/lightrag/lightrag/components/model_client/utils.py similarity index 100% rename from lightrag/components/model_client/utils.py rename to lightrag/lightrag/components/model_client/utils.py diff --git a/lightrag/components/output_parsers/__init__.py b/lightrag/lightrag/components/output_parsers/__init__.py similarity index 100% rename from lightrag/components/output_parsers/__init__.py rename to lightrag/lightrag/components/output_parsers/__init__.py diff --git a/lightrag/components/output_parsers/outputs.py b/lightrag/lightrag/components/output_parsers/outputs.py similarity index 78% rename from lightrag/components/output_parsers/outputs.py rename to 
lightrag/lightrag/components/output_parsers/outputs.py index 9731f2c5..f36af45c 100644 --- a/lightrag/components/output_parsers/outputs.py +++ b/lightrag/lightrag/components/output_parsers/outputs.py @@ -5,13 +5,15 @@ """ from dataclasses import is_dataclass -from typing import Dict, Any, Optional +from typing import Dict, Any, Optional, List import logging from lightrag.core.component import Component from lightrag.core.prompt_builder import Prompt -from lightrag.core.string_parser import YAMLParser, ListParser, JsonParser +from lightrag.core.string_parser import YamlParser, ListParser, JsonParser from lightrag.core.base_data_class import DataClass, DataClassFormatType +from lightrag.core.base_data_class import ExcludeType + # TODO: might be worth to parse a list of yaml or json objects. For instance, a list of jokes. # setup: Why couldn't the bicycle stand up by itself? @@ -35,22 +37,21 @@ {{schema}} ``` {% if example %} -Here is an example: +Examples: ``` {{example}} ``` {% endif %} -Make sure to always enclose the JSON output in triple backticks (```). Please do not add anything other than valid JSON output! -Use double quotes for the keys and string values. --Follow the JSON formatting conventions. -""" +-Follow the JSON formatting conventions.""" YAML_OUTPUT_FORMAT = r"""Your output should be formatted as a standard YAML instance with the following schema: ``` {{schema}} ``` {% if example %} -Here is an example: +Examples: ``` {{example}} ``` @@ -58,8 +59,7 @@ -Make sure to always enclose the YAML output in triple backticks (```). Please do not add anything other than valid YAML output! -Follow the YAML formatting conventions with an indent of 2 spaces. --Quote the string values properly. -""" +-Quote the string values properly.""" LIST_OUTPUT_FORMAT = r"""Your output should be formatted as a standard Python list. -Each element can be of any Python data type such as string, integer, float, list, dictionary, etc. @@ -98,11 +98,14 @@ def call(self, input: str) -> Any: class YamlOutputParser(OutputParser): __doc__ = r"""YAML output parser using dataclass for schema extraction. + .. note:: + Only use yaml for simple dataclass objects. For complex objects, use JSON. + Args: data_class (Type): The dataclass to extract the schema for the YAML output. example (Type, optional): The example dataclass object to show in the prompt. Defaults to None. yaml_output_format_template (str, optional): The template for the YAML output format. Defaults to YAML_OUTPUT_FORMAT. - output_processors (Component, optional): The output processors to parse the YAML string to JSON object. Defaults to YAMLParser(). + output_processors (Component, optional): The output processors to parse the YAML string to JSON object. Defaults to YamlParser(). 
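    As a quick, illustrative sketch of the updated constructor with the ``examples`` and ``exclude_fields`` arguments (the ``Joke`` dataclass below is hypothetical, not part of the library):

    .. code-block:: python

        from dataclasses import dataclass, field

        from lightrag.core import DataClass
        from lightrag.components.output_parsers.outputs import YamlOutputParser

        @dataclass
        class Joke(DataClass):
            setup: str = field(metadata={"desc": "The question part of the joke"})
            punchline: str = field(metadata={"desc": "The answer part of the joke"})

        parser = YamlOutputParser(
            data_class=Joke,
            examples=[
                Joke(setup="Why did the chicken cross the road?", punchline="To get to the other side.")
            ],
        )
        # schema plus the example, rendered for the LLM prompt
        print(parser.format_instructions())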
Examples: @@ -139,9 +142,8 @@ class YamlOutputParser(OutputParser): def __init__( self, data_class: DataClass, - example: DataClass = None, - template: Optional[str] = None, - output_processors: Optional[Component] = None, + examples: List[DataClass] = None, + exclude_fields: ExcludeType = None, ): super().__init__() @@ -149,18 +151,19 @@ def __init__( raise ValueError(f"Provided class is not a dataclass: {data_class}") # ensure example is instance of data class and initiated - if example is not None and not isinstance(example, data_class): + if examples is not None and not isinstance(examples[0], data_class): raise ValueError( f"Provided example is not an instance of the data class: {data_class}" ) - template = template or YAML_OUTPUT_FORMAT + self._exclude_fields = exclude_fields self.data_class_for_yaml = data_class - self.yaml_output_format_prompt = Prompt(template=template) - self.output_processors = output_processors or YAMLParser() - self.example = example + self.yaml_output_format_prompt = Prompt(template=YAML_OUTPUT_FORMAT) + self.output_processors = YamlParser() + self.examples = examples def format_instructions( - self, format_type: Optional[DataClassFormatType] = None + self, + format_type: Optional[DataClassFormatType] = None, ) -> str: r"""Return the formatted instructions to use in prompt for the YAML output format. @@ -168,14 +171,23 @@ def format_instructions( format_type (DataClassFormatType, optional): The format type to show in the prompt. Defaults to DataClassFormatType.SIGNATURE_YAML for less token usage. Options: DataClassFormatType.SIGNATURE_YAML, DataClassFormatType.SIGNATURE_JSON, DataClassFormatType.SCHEMA. + exclude (List[str], optional): The fields to exclude from the schema of the data class. """ format_type = format_type or DataClassFormatType.SIGNATURE_YAML - schema = self.data_class_for_yaml.format_class_str(format_type=format_type) + schema = self.data_class_for_yaml.format_class_str( + format_type=format_type, exclude=self._exclude_fields + ) # convert example to string, convert data class to yaml string + example_str = "" try: - example_str = self.example.format_example_str( - format_type=DataClassFormatType.EXAMPLE_YAML - ) + for example in self.examples: + per_example_str = example.format_example_str( + format_type=DataClassFormatType.EXAMPLE_YAML, + exclude=self._exclude_fields, + ) + example_str += f"{per_example_str}\n________\n" + # remove the last new line + example_str = example_str[:-1] log.debug(f"{__class__.__name__} example_str: {example_str}") except Exception: @@ -188,7 +200,7 @@ def call(self, input: str) -> YAML_OUTPUT_PARSER_OUTPUT_TYPE: return self.output_processors(input) def _extra_repr(self) -> str: - s = f"data_class_for_yaml={self.data_class_for_yaml}" + s = f"data_class_for_yaml={self.data_class_for_yaml}, examples={self.examples}" return s @@ -196,26 +208,28 @@ class JsonOutputParser(OutputParser): def __init__( self, data_class: DataClass, - example: DataClass = None, - template: Optional[str] = None, - output_processors: Optional[Component] = None, + examples: List[DataClass] = None, + exclude_fields: ExcludeType = None, ): super().__init__() if not is_dataclass(data_class): raise ValueError(f"Provided class is not a dataclass: {data_class}") - if example is not None and not isinstance(example, data_class): + if examples is not None and not isinstance(examples[0], data_class): raise ValueError( f"Provided example is not an instance of the data class: {data_class}" ) - template = template or JSON_OUTPUT_FORMAT + 
self._exclude_fields = exclude_fields + template = JSON_OUTPUT_FORMAT self.data_class_for_json = data_class self.json_output_format_prompt = Prompt(template=template) - self.output_processors = output_processors or JsonParser() - self.example = example + self.output_processors = JsonParser() + self.examples = examples + # TODO: make exclude work with both def format_instructions( - self, format_type: Optional[DataClassFormatType] = None + self, + format_type: Optional[DataClassFormatType] = None, ) -> str: r"""Return the formatted instructions to use in prompt for the JSON output format. @@ -225,11 +239,19 @@ Options: DataClassFormatType.SIGNATURE_YAML, DataClassFormatType.SIGNATURE_JSON, DataClassFormatType.SCHEMA. """ format_type = format_type or DataClassFormatType.SIGNATURE_JSON - schema = self.data_class_for_json.format_class_str(format_type=format_type) + schema = self.data_class_for_json.format_class_str( + format_type=format_type, exclude=self._exclude_fields + ) + example_str = "" try: - example_str = self.example.format_example_str( - format_type=DataClassFormatType.EXAMPLE_JSON - ) + for example in self.examples: + per_example_str = example.format_example_str( + format_type=DataClassFormatType.EXAMPLE_JSON, + exclude=self._exclude_fields, + ) + example_str += f"{per_example_str}\n________\n" + # remove the last new line + example_str = example_str[:-1] log.debug(f"{__class__.__name__} example_str: {example_str}") except Exception: @@ -240,7 +262,7 @@ def call(self, input: str) -> Any: return self.output_processors(input) def _extra_repr(self) -> str: - s = f"data_class_for_json={self.data_class_for_json}" + s = f"data_class_for_json={self.data_class_for_json}, examples={self.examples}, exclude_fields={self._exclude_fields}" return s diff --git a/lightrag/lightrag/components/reasoning/__init__.py b/lightrag/lightrag/components/reasoning/__init__.py new file mode 100644 index 00000000..f9340a77 --- /dev/null +++ b/lightrag/lightrag/components/reasoning/__init__.py @@ -0,0 +1 @@ +from .chain_of_thought import * # noqa: F401, F403 diff --git a/lightrag/lightrag/components/reasoning/chain_of_thought.py b/lightrag/lightrag/components/reasoning/chain_of_thought.py new file mode 100644 index 00000000..699975e2 --- /dev/null +++ b/lightrag/lightrag/components/reasoning/chain_of_thought.py @@ -0,0 +1,96 @@ +""" +https://arxiv.org/abs/2201.11903, published in January 2022 + +Chain-of-thought (CoT) mimics a step-by-step thought process for arriving at the answer. You can achieve it in two ways: +1. Add instructions such as "Let's think step-by-step to answer this question". +2. Add few-shot examples such as +' +Q: Roger has 5 tennis balls. He buys 2 more cans of tennis balls. Each can has 3 tennis balls. How many tennis balls does he have now? +A: Roger started with 5 balls. 2 cans of 3 tennis balls each is 6 tennis balls. 5 + 6 = 11. The answer is 11. +' + +NOTE: CoT can be helpful for more complicated tasks, but its benefit varies from task to task and model to model. +For instance, CoT-style behavior might already be built into gpt-3.5+ API calls. + +Benchmark it with and without CoT to see if it helps. +""" + +# from core.component import Component +# from core.generator import Generator +# from core.string_parser import JsonParser +# from core.model_client import ModelClient +# from core.default_prompt_template import DEFAULT_LIGHTRAG_SYSTEM_PROMPT + + +COT_TASK_DESC_STR_BASIC = ( + "You are a helpful assistant. Let's think step-by-step to answer user's query."
+) +# Using triple quotes to include JSON-like structure more cleanly +COT_TASK_DESC_STR_WITH_JSON_OUTPUT = f""" +{COT_TASK_DESC_STR_BASIC} Output JSON format: {{"thought": "", "answer": ""}} +""" + + +# ChainOfThought will just be a generator with preset_prompt_kwargs of the task_desc_str = COT_TASK_DESC_STR +# additional you can ask it to generate a json with "thought" and "anwer" keys and use jsonParser + + +# class CoTGenerator(Generator): +# r""" +# CoTGenerator is a subclass of Generator with default task_desc_str preset for Chain of Thought. +# Output will be string. +# It is exactly the same as using a Generator. +# Example: +# ``` +# cot = CoTGenerator(model_client=model_client, model_kwargs={"model": model}) +# ``` +# """ + +# def __init__( +# self, +# *, +# model_client: ModelClient, +# model_kwargs: Dict = {}, +# template: Optional[str] = None, +# preset_prompt_kwargs: Optional[Dict] = None, +# output_processors: Optional[Component] = None, +# ) -> None: + +# super().__init__( +# model_client=model_client, +# model_kwargs=model_kwargs, +# template=template or DEFAULT_LIGHTRAG_SYSTEM_PROMPT, +# preset_prompt_kwargs=preset_prompt_kwargs +# or {"task_desc_str": COT_TASK_DESC_STR_BASIC}, +# output_processors=output_processors, +# ) + + +# class CoTGeneratorWithJsonOutput(Generator): +# r""" +# CoTGeneratorWithJsonOutput is a subclass of Generator with default task_desc_str preset for Chain of Thought. +# Output will be parsed as JSON with "thought" and "answer" keys. +# Example: +# ``` +# cot = CoTGeneratorWithJsonOutput(model_client=model_client, model_kwargs={"model": model}) +# ``` +# """ + +# def __init__( +# self, +# *, +# model_client: ModelClient, +# model_kwargs: Dict = {}, +# template: Optional[str] = None, +# preset_prompt_kwargs: Optional[Dict] = None, +# output_processors: Optional[Component] = None, +# ) -> None: + +# super().__init__( +# model_client=model_client, +# model_kwargs=model_kwargs, +# template=template or DEFAULT_LIGHTRAG_SYSTEM_PROMPT, +# preset_prompt_kwargs=preset_prompt_kwargs +# or {"task_desc_str": COT_TASK_DESC_STR_WITH_JSON_OUTPUT}, +# output_processors=output_processors or JsonParser(), +# ) diff --git a/lightrag/components/retriever/__init__.py b/lightrag/lightrag/components/retriever/__init__.py similarity index 94% rename from lightrag/components/retriever/__init__.py rename to lightrag/lightrag/components/retriever/__init__.py index a784d2bb..fb51c68e 100644 --- a/lightrag/components/retriever/__init__.py +++ b/lightrag/lightrag/components/retriever/__init__.py @@ -1,5 +1,5 @@ from .bm25_retriever import ( - InMemoryBM25Retriever, + BM25Retriever, split_text_by_word_fn, split_text_by_word_fn_then_lower_tokenized, ) @@ -23,7 +23,7 @@ ) __all__ = [ - "InMemoryBM25Retriever", + "BM25Retriever", "LLMRetriever", "FAISSRetriever", "RerankerRetriever", diff --git a/lightrag/components/retriever/bm25_retriever.py b/lightrag/lightrag/components/retriever/bm25_retriever.py similarity index 97% rename from lightrag/components/retriever/bm25_retriever.py rename to lightrag/lightrag/components/retriever/bm25_retriever.py index 044c73a9..6e2bc79a 100644 --- a/lightrag/components/retriever/bm25_retriever.py +++ b/lightrag/lightrag/components/retriever/bm25_retriever.py @@ -42,6 +42,7 @@ def split_text_by_word_fn_then_lower_tokenized(x: str) -> List[str]: final_tokens.append(tokenizer.decode([token])) return final_tokens + def split_text_tokenized(x: str) -> List[str]: tokenizer = Tokenizer() # words = x.lower().split(" ") @@ -54,7 +55,7 @@ def 
split_text_tokenized(x: str) -> List[str]: return final_tokens -class InMemoryBM25Retriever(Retriever[str, RetrieverStrQueryType]): +class BM25Retriever(Retriever[str, RetrieverStrQueryType]): __doc__ = r"""Fast Implementation of Best Matching 25 ranking function. It expects str as the final document type after ``document_map_func`` if the given document is not already in the format of List[str]. @@ -100,7 +101,7 @@ class InMemoryBM25Retriever(Retriever[str, RetrieverStrQueryType]): .. code-block:: python - from lightrag.components.retriever.bm25_retriever import InMemoryBM25Retriever + from lightrag.components.retriever.bm25_retriever import BM25Retriever documents = ["hello world", "world is beautiful", "today is a good day"] @@ -108,7 +109,7 @@ class InMemoryBM25Retriever(Retriever[str, RetrieverStrQueryType]): .. code-block:: python - retriever = InMemoryBM25Retriever(top_k=1, documents=documents) + retriever = BM25Retriever(top_k=1, documents=documents) output = retriever("hello") print(output) # Output: @@ -117,8 +118,7 @@ class InMemoryBM25Retriever(Retriever[str, RetrieverStrQueryType]): 2. Pass the documents from the :meth:`build_index_from_documents` method: .. code-block:: python - - retriever = InMemoryBM25Retriever(top_k=1) + retriever = BM25Retriever(top_k=1) retriever.build_index_from_documents(documents) output = retriever("hello") @@ -127,7 +127,7 @@ class InMemoryBM25Retriever(Retriever[str, RetrieverStrQueryType]): .. code-block:: python retriever.save_to_file("bm25_index.json") - retriever2 = InMemoryBM25Retriever.load_from_file("bm25_index.json") + retriever2 = BM25Retriever.load_from_file("bm25_index.json") output = retriever2("hello") print(output) diff --git a/lightrag/components/retriever/faiss_retriever.py b/lightrag/lightrag/components/retriever/faiss_retriever.py similarity index 100% rename from lightrag/components/retriever/faiss_retriever.py rename to lightrag/lightrag/components/retriever/faiss_retriever.py diff --git a/lightrag/components/retriever/llm_retriever.py b/lightrag/lightrag/components/retriever/llm_retriever.py similarity index 100% rename from lightrag/components/retriever/llm_retriever.py rename to lightrag/lightrag/components/retriever/llm_retriever.py diff --git a/lightrag/components/retriever/postgres_retriever.py b/lightrag/lightrag/components/retriever/postgres_retriever.py similarity index 100% rename from lightrag/components/retriever/postgres_retriever.py rename to lightrag/lightrag/components/retriever/postgres_retriever.py diff --git a/lightrag/components/retriever/reranker_retriever.py b/lightrag/lightrag/components/retriever/reranker_retriever.py similarity index 100% rename from lightrag/components/retriever/reranker_retriever.py rename to lightrag/lightrag/components/retriever/reranker_retriever.py diff --git a/lightrag/core/README.md b/lightrag/lightrag/core/README.md similarity index 100% rename from lightrag/core/README.md rename to lightrag/lightrag/core/README.md diff --git a/lightrag/core/__init__.py b/lightrag/lightrag/core/__init__.py similarity index 89% rename from lightrag/core/__init__.py rename to lightrag/lightrag/core/__init__.py index 5bd17f04..485c410f 100644 --- a/lightrag/core/__init__.py +++ b/lightrag/lightrag/core/__init__.py @@ -1,12 +1,17 @@ -from .component import Component, Sequential, FunComponent, fun_to_component -from .parameter import Parameter -from .model_client import ModelClient from .base_data_class import DataClass, required_field, DataClassFormatType +from .component import Component, 
Sequential, FunComponent, fun_to_component +from .db import LocalDB +from .default_prompt_template import DEFAULT_LIGHTRAG_SYSTEM_PROMPT from .embedder import Embedder, BatchEmbedder +from .generator import Generator +from .model_client import ModelClient +from .parameter import Parameter +from .prompt_builder import Prompt + from .retriever import Retriever +from .tokenizer import Tokenizer -from .generator import Generator from .types import ( ModelType, ModelClientType, @@ -34,12 +39,12 @@ UserQuery, AssistantResponse, DialogTurn, - DialogSession, + Conversation, ) -from .prompt_builder import Prompt from lightrag.utils.registry import EntityMapping __all__ = [ + "LocalDB", "Component", "Sequential", "FunComponent", @@ -49,6 +54,7 @@ "required_field", "Generator", "Prompt", + "DEFAULT_LIGHTRAG_SYSTEM_PROMPT", "Parameter", "required_field", "ModelClient", @@ -81,7 +87,8 @@ "UserQuery", "AssistantResponse", "DialogTurn", - "DialogSession", + "Conversation", + "Tokenizer", ] for name in __all__: diff --git a/lightrag/lightrag/core/base_data_class.py b/lightrag/lightrag/core/base_data_class.py new file mode 100644 index 00000000..6cbd33b2 --- /dev/null +++ b/lightrag/lightrag/core/base_data_class.py @@ -0,0 +1,528 @@ +""" +The role of the base data class in LightRAG for LLM applications is like `Tensor` for `PyTorch`. +""" + +from typing import List, Dict, Any, Optional, Union, Callable +import collections + +import enum +from copy import deepcopy +from dataclasses import ( + dataclass, + field, + fields, + make_dataclass, + is_dataclass, +) + +import json +import yaml +import warnings +import logging + +from lightrag.core.functional import ( + # dataclass_obj_to_dict, + custom_asdict, + dataclass_obj_from_dict, + get_dataclass_schema, + convert_schema_to_signature, + represent_ordereddict, +) + + +logger = logging.getLogger(__name__) + + +class DataClassFormatType(enum.Enum): + r"""The format type for the DataClass schema.""" + + # for class + SCHEMA = "schema" + SIGNATURE_YAML = "signature_yaml" + SIGNATURE_JSON = "signature_json" + # for instance + EXAMPLE_YAML = "example_yaml" + EXAMPLE_JSON = "example_json" + + +# Register the custom representer +yaml.add_representer(collections.OrderedDict, represent_ordereddict) + + +def required_field() -> Callable[[], Any]: + """ + A factory function to create a required field in a dataclass. + The returned callable raises a TypeError when invoked, indicating a required field was not provided. + + Args: + name (Optional[str], optional): The name of the required field. Defaults to None + + Returns: + Callable[[], Any]: A callable that raises TypeError when called, indicating a missing required field. + + Example: + + .. code-block:: python + + from dataclasses import dataclass + from lightrag.core.base_data_class import required_field, DataClass + + @dataclass + class Person(DataClass): + name: str = field(default=None) + age: int = field(default_factory=required_field())# allow required field after optional field + """ + + def required_field_error(): + """This function is returned by required_field and raises an error indicating the field is required.""" + raise TypeError("This field is required and was not provided.") + + required_field_error.__name__ = ( + "required_field" # Set the function's name explicitly + ) + return required_field_error + + +# Dict is for the nested dataclasses, e.g. 
{"Person": ["name", "age"], "Address": ["city"]} +ExcludeType = Optional[Union[List[str], Dict[str, List[str]]]] + + +class DataClass: + __doc__ = r"""The base data class for all data types that interact with LLMs. + + Please only exclude optional fields in the exclude dictionary. + + Designed to streamline the handling, serialization, and description of data within our applications, especially to LLM prompt. + We explicitly handle this instead of relying on 3rd party libraries such as pydantic or marshmallow to have better + transparency and to keep the order of the fields when get serialized. + + How to create your own dataclass? + + 1. Subclass DataClass and define the fields with the `field` decorator. + 2. Use the `medata` argument and a `desc` key to describe the field. + 3. Keep the order of the fields as how you want them to be serialized and described to LLMs. + 4. field with default value is considered optional. Field without default value and field with default_factory=required_field is considered required. + + How to use it? + + Describing: + + We defined :ref:`DataClassFormatType ` to categorize DataClass description formats + as input or output in LLM prompt. + + + (1) For describing the class (data structure): + + `Signature` is more token effcient than schema, and schema as it is always a json string, when you want LLMs to output yaml, it can be misleading if you describe the data structure in json. + + - DataClassFormatType.SCHEMA: a more standard way to describe the data structure in Json string, :meth:`to_schema` as string and :meth:`to_schema` as dict. + - DataClassFormatType.SIGNATURE_JSON: emitating a json object with field name as key and description as value, :meth:`to_json_signature` as string. + - DataClassFormatType.SIGNATURE_YAML: emitating a yaml object with field name as key and description as value, :meth:`to_yaml_signature` as string. + + (2) For describing the class instance: this is helpful to do few-shot examples in LLM prompt. + - DataClassFormatType.EXAMPLE_JSON: the json representation of the instance, :meth:`to_json` as string. + - DataClassFormatType.EXAMPLE_YAML: the yaml representation of the instance, :meth:`to_yaml` as string. + + Overall, we have a unified class method :meth:`format_str` to generate formatted output based on the type of operation and class/instance context. + + note:: + You do not need to use our format, overwrite any method in the subclass to fit in your needs. + + Loading data: + + - :meth:`from_dict` is used to create a dataclass instance from a dictionary. + + + Refer :ref:`DataClass` for more detailed instructions. + + Examples: + + .. 
code-block:: python + + # Define a dataclass + from lightrag.core import DataClass + from dataclasses import dataclass, field + + @dataclass + class MyOutputs(DataClass): + age: int = field(metadata={"desc": "The age of the person", "prefix": "Age:"}) + name: str = field(metadata={"desc": "The name of the person", "prefix": "Name:"}) + + # Create json signature + print(MyOutputs.to_json_signature()) + # Output: + # { + # "age": "The age of the person", + # "name": "The name of the person" + # } + # Create yaml signature + print(MyOutputs.to_yaml_signature()) + # Output: + # age: The age of the person + # name: The name of the person + + # Create a dataclass instance + my_instance = MyOutputs(age=25, name="John Doe") + # Create json example + print(my_instance.to_json_example()) + # Output: + # { + # "age": 25, + # "name": "John Doe" + # } + # Create yaml signature + print(my_instance.to_yaml_example()) + # Output: + # age: 25 + # name: John Doe + + """ + + def __post_init__(self): + + for f in fields(self): + if "desc" not in f.metadata and "description" not in f.metadata: + warnings.warn( + f"Class { self.__class__.__name__} Field {f.name} is missing 'desc' in metadata", + UserWarning, + ) + + def to_dict(self, exclude: ExcludeType = None) -> Dict[str, Any]: + """Convert a dataclass object to a dictionary. + + Supports nested dataclasses, lists, and dictionaries. + Allow exclude keys for each dataclass object. + + Use cases: + - Decide what information will be included to be serialized to JSON or YAML that can be used in LLM prompt. + - Exclude sensitive information from the serialized output. + - Serialize the dataclass instance to a dictionary for saving states. + + Args: + exclude (Optional[Dict[str, List[str]]], optional): A dictionary of fields to exclude for each dataclass object. Defaults to None. + + + Example: + + .. code-block:: python + + from dataclasses import dataclass + from typing import List + + @dataclass + class TrecData: + question: str + label: int + + @dataclass + class TrecDataList(DataClass): + + data: List[TrecData] + name: str + + trec_data = TrecData(question="What is the capital of France?", label=0) + trec_data_list = TrecDataList(data=[trec_data], name="trec_data_list") + + trec_data_list.to_dict(exclude={"TrecData": ["label"], "TrecDataList": ["name"]}) + + # Output: + # {'data': [{'question': 'What is the capital of France?'}]} + """ + if not is_dataclass(self): + raise ValueError("to_dict() called on a class type, not an instance.") + excluded: Optional[Dict[str, List[str]]] = None + if exclude and isinstance(exclude, List): + excluded = {self.__class__.__name__: exclude} + elif exclude and isinstance(exclude, Dict): + excluded = deepcopy(exclude) + else: + excluded = None + return custom_asdict(self, exclude=excluded) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "DataClass": + """Create a dataclass instance from a dictionary. + + Supports nested dataclasses, lists, and dictionaries. + + Example from the :meth:`to_dict` method. + + ..code-block:: python + + data_dict = trec_data_list.to_dict() + restored_data = TreDataList.from_dict(data_dict) + + assert str(restored_data.__dict__) == str(trec_data_list.__dict__) + + .. note:: + If any required field is missing, it will raise an error. + Do not use the dict that has excluded required fields. + + Use cases: + - Convert the json/yaml output from LLM prediction to a dataclass instance. + - Restore the dataclass instance from the serialized output used for states saving. 
+ """ + return dataclass_obj_from_dict(cls, data) + + @classmethod + def from_json(cls, json_str: str) -> "DataClass": + """Create a dataclass instance from a JSON string. + + Args: + json_str (str): The JSON string to convert to a dataclass instance. + + Example: + + .. code-block:: python + + json_str = '{"question": "What is the capital of France?", "label": 0}' + trec_data = TrecData.from_json(json_str) + """ + try: + data = json.loads(json_str) + return cls.from_dict(data) + except json.JSONDecodeError as e: + raise ValueError(f"Failed to load JSON string: {e}") + + def to_json_obj(self, exclude: ExcludeType = None) -> Any: + r"""Convert the dataclass instance to a JSON object. + + :meth:`to_dict` along with the use of sort_keys=False to ensure the order of the fields is maintained. + This can be important to llm prompt. + + Args: + + exclude (Optional[Dict[str, List[str]]], optional): A dictionary of fields to exclude for each dataclass object. Defaults to None. + """ + return json.loads(self.to_json(exclude)) + + def to_json(self, exclude: ExcludeType = None) -> str: + r"""Convert the dataclass instance to a JSON string. + + :meth:`to_dict` along with the use of sort_keys=False to ensure the order of the fields is maintained. + This can be important to llm prompt. + + Args: + + exclude (Optional[Dict[str, List[str]]], optional): A dictionary of fields to exclude for each dataclass object. Defaults to None. + """ + return json.dumps(self.to_dict(exclude), indent=4, sort_keys=False) + + @classmethod + def from_yaml(cls, yaml_str: str) -> "DataClass": + """Create a dataclass instance from a YAML string. + + Args: + yaml_str (str): The YAML string to convert to a dataclass instance. + + Example: + + .. code-block:: python + + yaml_str = 'question: What is the capital of France?\nlabel: 0' + trec_data = TrecData.from_yaml(yaml_str) + """ + try: + data = yaml.safe_load(yaml_str) + return cls.from_dict(data) + except yaml.YAMLError as e: + raise ValueError(f"Failed to load YAML string: {e}") + + def to_yaml_obj(self, exclude: ExcludeType = None) -> Any: + r"""Convert the dataclass instance to a YAML object. + + :meth:`to_dict` along with the use of sort_keys=False to ensure the order of the fields is maintained. + + Args: + + exclude (Optional[Dict[str, List[str]]], optional): A dictionary of fields to exclude for each dataclass object. Defaults to None. + """ + return yaml.safe_load(self.to_yaml(exclude)) + + def to_yaml(self, exclude: ExcludeType = None) -> str: + r"""Convert the dataclass instance to a YAML string. + + :meth:`to_dict` along with the use of sort_keys=False to ensure the order of the fields is maintained. + + Args: + + exclude (Optional[Dict[str, List[str]]], optional): A dictionary of fields to exclude for each dataclass object. Defaults to None. 
+ """ + return yaml.dump( + self.to_dict(exclude), default_flow_style=False, sort_keys=False + ) + + @classmethod + def to_schema(cls, exclude: ExcludeType = None) -> Dict[str, Dict[str, Any]]: + """Generate a Json schema which is more detailed than the signature.""" + # convert exclude to dict if it is a list + excluded: Optional[Dict[str, List[str]]] = None + if exclude and isinstance(exclude, List): + excluded = {cls.__name__: exclude} + elif exclude and isinstance(exclude, Dict): + excluded = deepcopy(exclude) + else: + excluded = None + return get_dataclass_schema(cls, excluded) + + @classmethod + def to_schema_str(cls, exclude: ExcludeType = None) -> str: + """Generate a Json schema which is more detailed than the signature.""" + schema = cls.to_schema(exclude) + return json.dumps(schema, indent=4) + + @classmethod + def to_yaml_signature(cls, exclude: ExcludeType = None) -> str: + r"""Generate a YAML signature for the class from desc in metadata. + + Used mostly as LLM prompt to describe the output data format. + """ + # NOTE: we manually format the yaml string as the yaml.dump will always sort the keys + # Which can impact the final model output + schema = cls.to_schema(exclude) + signature_dict = convert_schema_to_signature(schema) + yaml_content = [] + for key, value in signature_dict.items(): + yaml_content.append(f"{key}: {value}") + + yaml_output = "\n".join(yaml_content) + return yaml_output + + @classmethod + def to_json_signature(cls, exclude: ExcludeType = None) -> str: + """Generate a JSON `signature`(json string) for the class from desc in metadata. + + Used mostly as LLM prompt to describe the output data format. + + Example: + + >>> @dataclass + >>> class MyOutputs(DataClass): + >>> age: int = field(metadata={"desc": "The age of the person", "prefix": "Age:"}) + >>> name: str = field(metadata={"desc": "The name of the person", "prefix": "Name:"}) + + >>> print(MyOutputs.to_json_signature()) + >>> # Output is a JSON string: + >>> # '{ + >>> # "age": "The age of the person (int) (required)", + >>> # "name": "The name of the person (str) (required)" + >>> #}' + """ + schema = cls.to_schema(exclude) + signature_dict = convert_schema_to_signature(schema) + + return json.dumps(signature_dict, indent=4) + + @classmethod + def to_dict_class(cls, exclude: ExcludeType = None) -> Dict[str, Any]: + """More of an internal used class method for serialization. + + Converts the dataclass to a dictionary, optionally excluding specified fields. + Use this to save states of the class in serialization, not advised to use in LLM prompt. + """ + return cls.to_schema(exclude) + + @classmethod + def format_class_str( + cls, + format_type: DataClassFormatType, + exclude: ExcludeType = None, + ) -> str: + """Generate formatted output based on the type of operation and class/instance context. + + Args: + format_type (DataClassFormatType): Specifies the format and type (schema, signature, example). + + Returns: + str: A string representing the formatted output. + + Examples: + + .. 
code-block:: python + + # Define a dataclass + from lightrag.core import DataClass + + """ + + if format_type == DataClassFormatType.SIGNATURE_JSON: + return cls.to_json_signature(exclude) + elif format_type == DataClassFormatType.SIGNATURE_YAML: + return cls.to_yaml_signature(exclude) + + elif format_type == DataClassFormatType.SCHEMA: + return cls.to_schema_str(exclude) + else: + raise ValueError(f"Unsupported format type: {format_type}") + + def format_example_str( + self, format_type: DataClassFormatType, exclude: ExcludeType = None + ) -> str: + """Generate formatted output based on the type of operation and class/instance context. + + Args: + format_type (DataClassFormatType): Specifies the format and type (schema, signature, example). + + Returns: + str: A string representing the formatted output. + + """ + + # Check the type of format required and whether it's called on an instance or class + if format_type == DataClassFormatType.EXAMPLE_JSON: + return self.to_json(exclude) + elif format_type == DataClassFormatType.EXAMPLE_YAML: + return self.to_yaml(exclude) + else: + raise ValueError(f"Unsupported format type: {format_type}") + + +"""Reserved for Agent to automatically create a dataclass and to manipulate the code""" + + +@dataclass +class DynamicDataClassFactory: + __doc__ = r""" + This class is used to create a dynamic dataclass called `DynamicOutputs` from a dictionary. + The dictionary should have the following structure: + { + "field_name": { + "value": field_value, + "desc": "Field description", + }, + + } + + Examples: + + .. code-block:: python + + data = { + "age": {"value": 30, "desc": "The age of the person"}, + "name": {"value": "John Doe", "desc": "The name of the person"}, + } + + DynamicOutputs = DynamicDataClassFactory.create_from_dict(data) + class_instance = DynamicOutputs() + print(class_instance) + + # Output: + # DataClass(age=30, name='John Doe') + """ + + @staticmethod + def create_from_dict(data: dict, base_class=DataClass, class_name="DynamicOutputs"): + fields_spec = [] + for key, value_dict in data.items(): + field_type = type(value_dict["value"]) + default_value = value_dict["value"] + metadata = { + "desc": value_dict.get("desc", "No description provided"), + } + fields_spec.append( + (key, field_type, field(default=default_value, metadata=metadata)) + ) + + dynamic_class = make_dataclass(class_name, fields_spec, bases=(base_class,)) + + return dynamic_class diff --git a/lightrag/core/component.py b/lightrag/lightrag/core/component.py similarity index 98% rename from lightrag/core/component.py rename to lightrag/lightrag/core/component.py index 10e85c7c..fd6c64fa 100644 --- a/lightrag/core/component.py +++ b/lightrag/lightrag/core/component.py @@ -37,15 +37,15 @@ T = TypeVar("T") -class _IncompatibleKeys( - namedtuple("IncompatibleKeys", ["missing_keys", "unexpected_keys"]) -): - def __repr__(self): - if not self.missing_keys and not self.unexpected_keys: - return "" - return super().__repr__() +# class _IncompatibleKeys( +# namedtuple("IncompatibleKeys", ["missing_keys", "unexpected_keys"]) +# ): +# def __repr__(self): +# if not self.missing_keys and not self.unexpected_keys: +# return "" +# return super().__repr__() - __str__ = __repr__ +# __str__ = __repr__ def _addindent(s_, numSpaces): @@ -750,9 +750,10 @@ def load(component: Component, local_state_dict: Mapping[str, Any], prefix=""): self.__class__.__name__, "\n\t".join(error_msgs) ) ) - return _IncompatibleKeys( - missing_keys=missing_keys, unexpected_keys=unexpected_keys - ) + return 
namedtuple("_IncompatibleKeys", ["missing_keys", "unexpected_keys"]) + # return _IncompatibleKeys( + # missing_keys=missing_keys, unexpected_keys=unexpected_keys + # ) # def apply(self: "Component", fn: Callable[["Component", Any], None]) -> None: # r""" diff --git a/lightrag/core/db.py b/lightrag/lightrag/core/db.py similarity index 100% rename from lightrag/core/db.py rename to lightrag/lightrag/core/db.py diff --git a/lightrag/core/default_prompt_template.py b/lightrag/lightrag/core/default_prompt_template.py similarity index 85% rename from lightrag/core/default_prompt_template.py rename to lightrag/lightrag/core/default_prompt_template.py index 3e739aa1..65813ee9 100644 --- a/lightrag/core/default_prompt_template.py +++ b/lightrag/lightrag/core/default_prompt_template.py @@ -16,8 +16,11 @@ "examples_str", ] -DEFAULT_LIGHTRAG_SYSTEM_PROMPT = r""" -{% if task_desc_str or output_format_str or tools_str or examples_str or chat_history_str or context_str or steps_str %} +SIMPLE_DEFAULT_LIGHTRAG_SYSTEM_PROMPT = r"""{{task_desc_str}} +User: {{input_str}} +You:""" + +DEFAULT_LIGHTRAG_SYSTEM_PROMPT = r"""{% if task_desc_str or output_format_str or tools_str or examples_str or chat_history_str or context_str or steps_str %} {% endif %} {# task desc #} diff --git a/lightrag/core/embedder.py b/lightrag/lightrag/core/embedder.py similarity index 100% rename from lightrag/core/embedder.py rename to lightrag/lightrag/core/embedder.py diff --git a/lightrag/lightrag/core/func_tool.py b/lightrag/lightrag/core/func_tool.py new file mode 100644 index 00000000..485b845a --- /dev/null +++ b/lightrag/lightrag/core/func_tool.py @@ -0,0 +1,333 @@ +""" +Tool is LLM's extended capability which is one of the core design pattern of Agent. All tools can be wrapped in a FunctionTool class. +This helps to standardize the tool interface and metadata to communicate with the Agent. +""" + +from typing import Any, Optional, Callable, Awaitable, Union +from inspect import iscoroutinefunction +import logging +import asyncio + + +from lightrag.core.types import ( + FunctionDefinition, + FunctionOutput, + Function, +) +from lightrag.core import Component +from lightrag.core.functional import ( + get_fun_schema, +) +from inspect import signature + +AsyncCallable = Callable[..., Awaitable[Any]] + +log = logging.getLogger(__name__) + + +def is_running_in_event_loop() -> bool: + try: + import asyncio + + asyncio.get_running_loop() + return True + except RuntimeError: + return False + + +FunctionType = Union[Callable[..., Any], Awaitable[Callable[..., Any]]] + + +# TODO: improve the support for async functions, similarly a component might be used as a tool +class FunctionTool(Component): + __doc__ = r"""Describing and executing a function via call with arguments. + + + container for a function that orchestrates the function formatting(to LLM), parsing, and execution. + + Function be used by LLM as a tool to achieve a specific task. + + Features: + - Supports both synchronous and asynchronous functions via ``call`` and ``acall``. + - Creates a FunctionDefinition from the function using ``get_fun_schema``. + - Executs the function with arguments. + [You can use Function and FunctionExpression as output format] + + - Please Parses the function call expression[FunctionExpression] into Function (name, args, kwargs). + - call or acall, or use execute to execute the function. + + - via call with args and kwargs. + - via eval without any context or sandboxing. + - via sandboxed execute directionly using ``sandbox_exec``. 
+ + + """ + + def __init__( + self, + fn: FunctionType, + definition: Optional[FunctionDefinition] = None, + ): + super().__init__() + assert fn is not None, "fn must be provided" + + self.fn = fn + self._is_async = iscoroutinefunction(fn) + + self.definition = definition or self._create_fn_definition() + if self._is_async: + log.info(f"FunctionTool: {fn} is async: {self._is_async}") + + def _create_fn_definition(self) -> FunctionDefinition: + name = self.fn.__name__ + docstring = self.fn.__doc__ + description = f"{docstring}" + description = f"{name}{signature(self.fn)}\n{docstring}" + # description = f"{name}{signature(self.fn)}\n{docstring}" + fn_parameters = get_fun_schema(name, self.fn) + return FunctionDefinition( + func_name=name, func_desc=description, func_parameters=fn_parameters + ) + + def call(self, *args: Any, **kwargs: Any) -> FunctionOutput: + r"""Execute the function synchronously. + + Example: + + .. code-block:: python + + import time + def sync_function_1(): + time.sleep(1) + return "Function 1 completed" + + tool_1 = FunctionTool(sync_function_1) + output = tool_1.call() + """ + if self._is_async: + raise ValueError("FunctionTool is asynchronous, use acall instead") + output, error = None, None + try: + output = self.fn(*args, **kwargs) + except Exception as e: + log.error(f"Error at calling {self.fn}: {e}") + # raise ValueError(f"Error: {e}") + error = str(e) + return FunctionOutput( + name=self.definition.func_name, + # raw_input={"args": args, "kwargs": kwargs}, + input=Function(name=self.definition.func_name, args=args, kwargs=kwargs), + output=output, + error=error, + ) + + async def acall(self, *args: Any, **kwargs: Any) -> FunctionOutput: + r"""Execute the function asynchronously. + + Need to be called in an async function or using asyncio.run. + + Example: + + .. code-block:: python + + import asyncio + async def async_function_1(): + await asyncio.sleep(1) # Simulate async work + return "Function 1 completed" + + async def call_async_function(): + tool_1 = FunctionTool(async_function_1) + output = await tool_1.acall() + + asyncio.run(call_async_function()) + """ + if not self._is_async: + raise ValueError("FunctionTool is not asynchronous, use call instead") + output = None + error = None + try: + output = await self.fn(*args, **kwargs) + except Exception as e: + log.error(f"Error at calling {self.fn}: {e}") + error = str(e) + + return FunctionOutput( + name=self.definition.func_name, + input=Function(name=self.definition.func_name, args=args, kwargs=kwargs), + output=output, + error=error, + ) + + def execute(self, *args, **kwargs) -> FunctionOutput: + r"""Execute the function synchronously or asynchronously based on the function type. + + No matter of the function type, you can run the function using both asyncio and without asyncio. + + + Use it with caution as it might block the event loop. + + Example: + + .. 
code-block:: python + + import asyncio + import time + + async def async_function_1(): + await asyncio.sleep(1) + return "Function 1 completed" + + def sync_function_1(): + time.sleep(1) + return "Function 1 completed" + + async def async_function_2(): + await asyncio.sleep(2) + return "Function 2 completed" + + def sync_function_2(): + time.sleep(2) + return "Function 2 completed" + + async_tool_1 = FunctionTool(async_function_1) + sync_tool_1 = FunctionTool(sync_function_2) + async_tool_2 = FunctionTool(async_function_2) + sync_tool_2 = FunctionTool(sync_function_2) + + def run_sync_and_async_mix_without_wait(): + # both sync and async tool can use execute + # sync tool can also use call + # takes 5 seconds (1+1+2) + overhead + start_time = time.time() + results = [ + async_tool_1.execute(), + sync_tool_1.execute(), + sync_tool_2.call(), + ] + end_time = time.time() + print(f"run_sync_and_async_mix_without_wait time: {end_time - start_time}") + return results + + async def run_sync_and_async_mix(): + # both sync and async tool can use execute&to_thread + # async tool can also use acall without to_thread + # takes a bit over 2 seconds max(2) + start_time = time.time() + results = await asyncio.gather( + async_tool_1.execute(), + sync_tool_1.execute(), + async_tool_2.acall(), + ) + end_time = time.time() + print(f"run_sync_and_async_mix time: {end_time - start_time}") + return results + + run_sync_and_async_mix_without_wait() + asyncio.run(run_sync_and_async_mix()) + """ + if self._is_async: + if is_running_in_event_loop(): + result = asyncio.create_task(self.acall(*args, **kwargs)) + else: + result = asyncio.run(self.acall(*args, **kwargs)) + else: + if is_running_in_event_loop(): + result = asyncio.to_thread(self.call, *args, **kwargs) + else: + result = self.call(*args, **kwargs) + # return result + # if is_running_in_event_loop(): # is called in an event loop + # if self._is_async: + # result = asyncio.create_task(self.acall(*args, **kwargs)) + # else: + # result = asyncio.to_thread(self.call, *args, **kwargs) + + # else: + # if self._is_async: + # result = asyncio.run(self.acall(*args, **kwargs)) + # else: + # result = self.call(*args, **kwargs) + # if self._is_async: + + # if is_running_in_event_loop(): + # # future = asyncio.run_coroutine_threadsafe( + # # self.acall(*args, **kwargs), asyncio.get_running_loop() + # # ) + # future = asyncio.create_task(self.acall(*args, **kwargs)) + + # result = asyncio.run(future) + # else: + # result = asyncio.run(self.acall(*args, **kwargs)) + # else: + # result = self.call(*args, **kwargs) + return result + + def __call__(self, *args, **kwargs) -> FunctionOutput: + r"""Execute the function synchronously or asynchronously based on the function type.""" + return self.execute(*args, **kwargs) + + def _extra_repr(self) -> str: + s = f"fn: {self.fn}, async: {self._is_async}, definition: {self.definition}" + return s + + +if __name__ == "__main__": + + import asyncio + import time + + async def async_function_1(): + await asyncio.sleep(1) + return "Function 1 completed" + + def sync_function_1(): + time.sleep(1) + return "Function 1 completed" + + async def async_function_2(): + await asyncio.sleep(2) + return "Function 2 completed" + + def sync_function_2(): + time.sleep(2) + return "Function 2 completed" + + async_tool_1 = FunctionTool(async_function_1) + sync_tool_1 = FunctionTool(sync_function_2) + async_tool_2 = FunctionTool(async_function_2) + sync_tool_2 = FunctionTool(sync_function_2) + + def run_sync_and_async_mix_without_wait(): + # both sync 
and async tool can use execute + # sync tool can also use call + # takes 5 seconds (1+1+2) + overhead + start_time = time.time() + results = [ + async_tool_1.execute(), + sync_tool_1.execute(), + sync_tool_2.call(), + ] + print(results) + end_time = time.time() + print(f"run_sync_and_async_mix_without_wait time: {end_time - start_time}") + return results + + async def run_sync_and_async_mix(): + # both sync and async tool can use execute&to_thread + # async tool can also use acall without to_thread + # takes a bit over 2 seconds max(2) + start_time = time.time() + results = await asyncio.gather( + async_tool_1.execute(), + sync_tool_1.execute(), + async_tool_2.acall(), + ) + print(results) + end_time = time.time() + print(f"run_sync_and_async_mix time: {end_time - start_time}") + return results + + print(async_tool_1.execute()) + + run_sync_and_async_mix_without_wait() + asyncio.run(run_sync_and_async_mix()) diff --git a/lightrag/lightrag/core/functional.py b/lightrag/lightrag/core/functional.py new file mode 100644 index 00000000..a71ac206 --- /dev/null +++ b/lightrag/lightrag/core/functional.py @@ -0,0 +1,988 @@ +"""Functional interface. +Core functions we use to build across the components. +Users can leverage these functions to customize their own components.""" + +from typing import ( + Dict, + Any, + Callable, + Union, + List, + Tuple, + Optional, + Type, + get_type_hints, + get_origin, + get_args, + Set, + Sequence, +) +import logging +import numpy as np +import re +import json +import yaml +import ast +import threading +from copy import deepcopy + +from inspect import signature, Parameter +from dataclasses import fields, is_dataclass, MISSING, Field + +log = logging.getLogger(__name__) + +ExcludeType = Optional[Dict[str, List[str]]] + + +######################################################################################## +# For Dataclass base class and all schema related functions +######################################################################################## + + +def custom_asdict( + obj, *, dict_factory=dict, exclude: ExcludeType = None +) -> Dict[str, Any]: + """Equivalent to asdict() from dataclasses module but with exclude fields. + + Return the fields of a dataclass instance as a new dictionary mapping + field names to field values, while allowing certain fields to be excluded. + + If given, 'dict_factory' will be used instead of built-in dict. + The function applies recursively to field values that are + dataclass instances. This will also look into built-in containers: + tuples, lists, and dicts. 
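    For instance, a small sketch of excluding a field by its owning class name (the ``Person`` dataclass is illustrative only):

    .. code-block:: python

        from dataclasses import dataclass

        @dataclass
        class Person:
            name: str
            ssn: str

        person = Person(name="Ada", ssn="123-45-6789")
        custom_asdict(person, exclude={"Person": ["ssn"]})
        # -> {'name': 'Ada'}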
+ """ + if not is_dataclass_instance(obj): + raise TypeError("custom_asdict() should be called on dataclass instances") + return _asdict_inner(obj, dict_factory, exclude or {}) + + +def _asdict_inner(obj, dict_factory, exclude): + if is_dataclass_instance(obj): + result = [] + for f in fields(obj): + if f.name in exclude.get(obj.__class__.__name__, []): + continue + value = _asdict_inner(getattr(obj, f.name), dict_factory, exclude) + result.append((f.name, value)) + return dict_factory(result) + elif isinstance(obj, tuple) and hasattr(obj, "_fields"): + return type(obj)(*[_asdict_inner(v, dict_factory, exclude) for v in obj]) + elif isinstance(obj, (list, tuple)): + return type(obj)(_asdict_inner(v, dict_factory, exclude) for v in obj) + elif isinstance(obj, dict): + return type(obj)( + ( + _asdict_inner(k, dict_factory, exclude), + _asdict_inner(v, dict_factory, exclude), + ) + for k, v in obj.items() + ) + else: + return deepcopy(obj) + + +# def dataclass_obj_to_dict( +# obj: Any, exclude: ExcludeType = None, parent_key: str = "" +# ) -> Dict[str, Any]: +# r"""Convert a dataclass object to a dictionary With exclude fields. + +# Equivalent to asdict() from dataclasses module but with exclude fields. + +# Supports nested dataclasses, lists, and dictionaries. +# Allow exclude keys for each dataclass object. +# Example: + +# .. code-block:: python + +# from dataclasses import dataclass +# from typing import List + +# @dataclass +# class TrecData: +# question: str +# label: int + +# @dataclass +# class TrecDataList: + +# data: List[TrecData] +# name: str + +# trec_data = TrecData(question="What is the capital of France?", label=0) +# trec_data_list = TrecDataList(data=[trec_data], name="trec_data_list") + +# dataclass_obj_to_dict(trec_data_list, exclude={"TrecData": ["label"], "TrecDataList": ["name"]}) + +# # Output: +# # {'data': [{'question': 'What is the capital of France?'}]} + +# """ +# if not is_dataclass_instance(obj): +# raise ValueError( +# f"dataclass_obj_to_dict() should be called with a dataclass instance." 
+# ) +# if exclude is None: +# exclude = {} + +# obj_class_name = obj.__class__.__name__ +# current_exclude = exclude.get(obj_class_name, []) + +# if hasattr(obj, "__dataclass_fields__"): +# return { +# key: dataclass_obj_to_dict(value, exclude, parent_key=key) +# for key, value in obj.__dict__.items() +# if key not in current_exclude +# } +# elif isinstance(obj, list): + + +# return [dataclass_obj_to_dict(item, exclude, parent_key) for item in obj] +# elif isinstance(obj, set): +# return {dataclass_obj_to_dict(item, exclude, parent_key) for item in obj} +# elif isinstance(obj, tuple): +# return (dataclass_obj_to_dict(item, exclude, parent_key) for item in obj) +# elif isinstance(obj, dict): +# return { +# key: dataclass_obj_to_dict(value, exclude, parent_key) +# for key, value in obj.items() +# } +# else: +# return deepcopy(obj) +def validate_data(data: Dict[str, Any], fieldtypes: Dict[str, Any]) -> bool: + required_fields = { + name for name, type in fieldtypes.items() if _is_required_field(type) + } + return required_fields <= data.keys() + + +def is_potential_dataclass(t): + """Check if the type is directly a dataclass or potentially a wrapped dataclass like Optional.""" + origin = get_origin(t) + if origin is Union: + # This checks if any of the arguments in a Union (which is what Optional is) is a dataclass + return any(is_dataclass(arg) for arg in get_args(t) if arg is not type(None)) + return is_dataclass(t) + + +def extract_dataclass_type(type_hint): + """Extract the actual dataclass type from a type hint that could be Optional or other generic.""" + origin = get_origin(type_hint) + if origin in (Union, Optional): + # Unpack Optional[SomeClass] or Union[SomeClass, None] + args = get_args(type_hint) + for arg in args: + if arg is not type(None) and is_dataclass(arg): + return arg + return type_hint if is_dataclass(type_hint) else None + + +def dataclass_obj_from_dict(cls: Type[object], data: Dict[str, object]) -> Any: + r"""Convert a dictionary to a dataclass object. + + Supports nested dataclasses, lists, and dictionaries. + + .. note:: + If any required field is missing, it will raise an error. + Do not use the dict that has excluded required fields. + + Example: + + .. 
code-block:: python + + from dataclasses import dataclass + from typing import List + + @dataclass + class TrecData: + question: str + label: int + + @dataclass + class TrecDataList: + + data: List[TrecData] + name: str + + trec_data_dict = {"data": [{"question": "What is the capital of France?", "label": 0}], "name": "trec_data_list"} + + dataclass_obj_from_dict(TrecDataList, trec_data_dict) + + # Output: + # TrecDataList(data=[TrecData(question='What is the capital of France?', label=0)], name='trec_data_list') + + """ + + if is_dataclass(cls) or is_potential_dataclass( + cls + ): # Optional[Address] will be false, and true for each check + + log.debug( + f"{is_dataclass(cls)} of {cls}, {is_potential_dataclass(cls)} of {cls}" + ) + cls_type = extract_dataclass_type(cls) + fieldtypes = {f.name: f.type for f in cls_type.__dataclass_fields__.values()} + return cls_type( + **{ + key: dataclass_obj_from_dict(fieldtypes[key], value) + for key, value in data.items() + } + ) + elif isinstance(data, (list, tuple)): + restored_data = [] + for item in data: + if cls.__args__[0] and hasattr(cls.__args__[0], "__dataclass_fields__"): + # restore the value to its dataclass type + restored_data.append(dataclass_obj_from_dict(cls.__args__[0], item)) + else: + # Use the original data [Any] + restored_data.append(item) + + return restored_data + + elif isinstance(data, set): + restored_data = set() + for item in data: + if cls.__args__[0] and hasattr(cls.__args__[0], "__dataclass_fields__"): + # restore the value to its dataclass type + restored_data.add(dataclass_obj_from_dict(cls.__args__[0], item)) + else: + # Use the original data [Any] + restored_data.add(item) + + return restored_data + + elif isinstance(data, dict): + for key, value in data.items(): + if ( + hasattr(cls, "__args__") + and len(cls.__args__) > 1 + and cls.__args__[1] + and hasattr(cls.__args__[1], "__dataclass_fields__") + ): + # restore the value to its dataclass type + data[key] = dataclass_obj_from_dict(cls.__args__[1], value) + else: + # Use the original data [Any] + data[key] = value + return data + + else: + log.debug(f"Not datclass, or list, or dict: {cls}, use the original data.") + return data + + +# Custom representer for OrderedDict +def represent_ordereddict(dumper, data): + value = [] + for item_key, item_value in data.items(): + node_key = dumper.represent_data(item_key) + node_value = dumper.represent_data(item_value) + value.append((node_key, node_value)) + return yaml.MappingNode("tag:yaml.org,2002:map", value) + + +def from_dict_to_json(data: Dict[str, Any], sort_keys: bool = False) -> str: + r"""Convert a dictionary to a JSON string.""" + try: + return json.dumps(data, indent=4, sort_keys=sort_keys) + except json.JSONDecodeError as e: + raise ValueError(f"Failed to convert dict to JSON: {e}") + + +def from_dict_to_yaml(data: Dict[str, Any], sort_keys: bool = False) -> str: + r"""Convert a dictionary to a YAML string.""" + try: + return yaml.dump(data, default_flow_style=False, sort_keys=sort_keys) + except yaml.YAMLError as e: + raise ValueError(f"Failed to convert dict to YAML: {e}") + + +def from_json_to_dict(json_str: str) -> Dict[str, Any]: + r"""Convert a JSON string to a dictionary.""" + try: + return json.loads(json_str) + except json.JSONDecodeError as e: + raise ValueError(f"Failed to convert JSON to dict: {e}") + + +def from_yaml_to_dict(yaml_str: str) -> Dict[str, Any]: + r"""Convert a YAML string to a dictionary.""" + try: + return yaml.safe_load(yaml_str) + except yaml.YAMLError as e: + raise 
ValueError(f"Failed to convert YAML to dict: {e}") + + +def is_dataclass_instance(obj): + return hasattr(obj, "__dataclass_fields__") + + +def get_type_schema(type_obj, exclude: ExcludeType = None) -> str: + """Retrieve the type name, handling complex and nested types.""" + origin = get_origin(type_obj) + if origin is Union: + # Handle Optional[Type] and other unions + args = get_args(type_obj) + types = [get_type_schema(arg, exclude) for arg in args if arg is not type(None)] + return ( + f"Optional[{types[0]}]" if len(types) == 1 else f"Union[{', '.join(types)}]" + ) + elif origin in {List, list}: + args = get_args(type_obj) + if args: + inner_type = get_type_schema(args[0], exclude) + return f"List[{inner_type}]" + else: + return "List" + + elif origin in {Dict, dict}: + args = get_args(type_obj) + if args and len(args) >= 2: + key_type = get_type_schema(args[0], exclude) + value_type = get_type_schema(args[1], exclude) + return f"Dict[{key_type}, {value_type}]" + else: + return "Dict" + elif origin in {Set, set}: + args = get_args(type_obj) + return f"Set[{get_type_schema(args[0], exclude)}]" if args else "Set" + + elif origin is Sequence: + args = get_args(type_obj) + return f"Sequence[{get_type_schema(args[0], exclude)}]" if args else "Sequence" + + elif origin in {Tuple, tuple}: + args = get_args(type_obj) + if args: + return f"Tuple[{', '.join(get_type_schema(arg, exclude) for arg in args)}]" + return "Tuple" + + elif is_dataclass(type_obj): + # Recursively handle nested dataclasses + output = str(get_dataclass_schema(type_obj, exclude)) + return output + return type_obj.__name__ if hasattr(type_obj, "__name__") else str(type_obj) + + +def get_dataclass_schema( + cls, exclude: ExcludeType = None +) -> Dict[str, Dict[str, object]]: + """Generate a schema dictionary for a dataclass including nested structures. + + 1. Support customized dataclass with required_field function. + 2. Support nested dataclasses, even with generics like List, Dict, etc. + 3. Support metadata in the dataclass fields. + """ + if not is_dataclass(cls): + raise ValueError( + "Provided class is not a dataclass, please decorate your class with @dataclass" + ) + + schema = {"type": cls.__name__, "properties": {}, "required": []} + # get the exclude list for the current class + current_exclude = exclude.get(cls.__name__, []) if exclude else [] + for f in fields(cls): + if f.name in current_exclude: + continue + # prepare field schema, it weill be done recursively for nested dataclasses + field_schema = {"type": get_type_schema(f.type, exclude)} + + # check required field + is_required = _is_required_field(f) + if is_required: + schema["required"].append(f.name) + + # add metadata to the field schema + if f.metadata: + field_schema.update(f.metadata) + # handle nested dataclasses and complex types + + schema["properties"][f.name] = field_schema + + return schema + + +def _is_required_field(f: Field) -> bool: + r"""Determine if the field of dataclass is required or optional. 
+ Customized for required_field function.""" + # Determine if the field is required or optional + # Using __name__ to check for function identity + if f.default is MISSING and ( + f.default_factory is MISSING + or ( + hasattr(f.default_factory, "__name__") + and f.default_factory.__name__ == "required_field" + ) + ): + return True + return False + + +def convert_schema_to_signature(schema: Dict[str, Dict[str, Any]]) -> Dict[str, str]: + r"""Convert the value from get_data_class_schema to a string description.""" + + signature = {} + schema_to_use = schema.get("properties", {}) + required_fields = schema.get("required", []) + for field_name, field_info in schema_to_use.items(): + field_signature = field_info.get("desc", "") + # add type to the signature + if field_info["type"]: + field_signature += f" ({field_info['type']})" + # add required/optional to the signature + if field_name in required_fields: + field_signature += " (required)" + else: + field_signature += " (optional)" + + signature[field_name] = field_signature + return signature + + +######################################################################################## +# For FunctionTool component +# It uses get_type_schema and get_dataclass_schema to generate the schema of arguments. +######################################################################################## +def get_fun_schema(name: str, func: Callable[..., object]) -> Dict[str, object]: + r"""Get the schema of a function. + Support dataclass, Union and normal data types such as int, str, float, etc, list, dict, set. + + Examples: + def example_function(x: int, y: str = "default") -> int: + return x + schema = get_fun_schema("example_function", example_function) + print(json.dumps(schema, indent=4)) + # Output: + { + "type": "object", + "properties": { + "x": { + "type": "int" + }, + "y": { + "type": "str", + "default": "default" + } + }, + "required": [ + "x" + ] + } + """ + sig = signature(func) + schema = {"type": "object", "properties": {}, "required": []} + type_hints = get_type_hints(func) + + for param_name, parameter in sig.parameters.items(): + param_type = type_hints.get(param_name, "Any") + if parameter.default == Parameter.empty: + schema["required"].append(param_name) + schema["properties"][param_name] = {"type": get_type_schema(param_type)} + else: + schema["properties"][param_name] = { + "type": get_type_schema(param_type), + "default": parameter.default, + } + + return schema + + +# For parse function call for FunctionTool component +def evaluate_ast_node(node: ast.AST, context_map: Dict[str, Any] = None): + """ + Recursively evaluates an AST node and returns the corresponding Python object. + + Args: + node (ast.AST): The AST node to evaluate. This node can represent various parts of Python expressions, + such as literals, identifiers, lists, dictionaries, and function calls. + context_map (Dict[str, Any]): A dictionary that maps variable names to their respective values and functions. + This context is used to resolve names and execute functions. + + Returns: + Any: The result of evaluating the node. The type of the returned object depends on the nature of the node: + - Constants return their literal value. + - Names are looked up in the context_map. + - Lists and tuples return their contained values as a list or tuple. + - Dictionaries return a dictionary with keys and values evaluated. + - Function calls invoke the function with evaluated arguments and return its result. 
+ + Raises: + ValueError: If the node type is unsupported, a ValueError is raised indicating the inability to evaluate the node. + """ + if isinstance(node, ast.Constant): + return node.value + elif isinstance(node, ast.Dict): + return { + evaluate_ast_node(k): evaluate_ast_node(v) + for k, v in zip(node.keys, node.values) + } + elif isinstance(node, ast.List): + return [evaluate_ast_node(elem) for elem in node.elts] + elif isinstance(node, ast.Tuple): + return tuple(evaluate_ast_node(elem) for elem in node.elts) + elif isinstance(node, ast.UnaryOp) and isinstance(node.op, ast.USub): + return -evaluate_ast_node(node.operand, context_map) # unary minus + elif isinstance( + node, ast.BinOp + ): # support "multiply(2024-2017, 12)", the "2024-2017" is a "BinOp" node + left = evaluate_ast_node(node.left, context_map) + right = evaluate_ast_node(node.right, context_map) + if isinstance(node.op, ast.Add): + return left + right + elif isinstance(node.op, ast.Sub): + return left - right + elif isinstance(node.op, ast.Mult): + return left * right + elif isinstance(node.op, ast.Div): + return left / right + elif isinstance(node.op, ast.Mod): + return left % right + elif isinstance(node.op, ast.Pow): + return left**right + else: + raise ValueError(f"Unsupported binary operator: {type(node.op)}") + elif isinstance(node, ast.Name): # variable name + try: + output_fun = context_map[node.id] + return output_fun + # TODO: raise the error back to the caller so that the llm can get the error message + except KeyError as e: + raise ValueError( + f"Error: {e}, {node.id} does not exist in the context_map." + ) + elif isinstance(node, ast.Attribute): # e.g. math.pi + value = evaluate_ast_node(node.value, context_map) + return getattr(value, node.attr) + + elif isinstance( + node, ast.Call + ): # another fun or class as argument and value, e.g. add( multiply(4,5), 3) + func = evaluate_ast_node(node.func, context_map) + args = [evaluate_ast_node(arg, context_map) for arg in node.args] + kwargs = { + kw.arg: evaluate_ast_node(kw.value, context_map) for kw in node.keywords + } + output = func(*args, **kwargs) + if hasattr(output, "raw_output"): + return output.raw_output + return output + else: + # directly evaluate the node + # print(f"Unsupported AST node type: {type(node)}") + # return eval(compile(ast.Expression(node), filename="", mode="eval")) + raise ValueError(f"Unsupported AST node type: {type(node)}") + + +def parse_function_call_expr( + function_expr: str, context_map: Dict[str, Any] = None +) -> Tuple[str, List[Any], Dict[str, Any]]: + """ + Parse a string representing a function call into its components and ensure safe execution by only allowing function calls from a predefined context map. + Args: + function_expr (str): The string representing the function + context_map (Dict[str, Any]): A dictionary that maps variable names to their respective values and functions. + This context is used to resolve names and execute functions. 
+ """ + function_expr = function_expr.strip() + # Parse the string into an AST + tree = ast.parse(function_expr, mode="eval") + + if isinstance(tree.body, ast.Call): + # Extract the function name + func_name = tree.body.func.id if isinstance(tree.body.func, ast.Name) else None + + # Prepare the list of arguments and keyword arguments + args = [evaluate_ast_node(arg, context_map) for arg in tree.body.args] + keywords = { + kw.arg: evaluate_ast_node(kw.value, context_map) + for kw in tree.body.keywords + } + + return func_name, args, keywords + else: + raise ValueError("Provided string is not a function call.") + + +def generate_function_call_expression_from_callable( + func: Callable[..., Any], *args: Any, **kwargs: Any +) -> str: + """ + Generate a function call expression string from a callable function and its arguments. + + Args: + func (Callable[..., Any]): The callable function. + *args (Any): Positional arguments to be passed to the function. + **kwargs (Any): Keyword arguments to be passed to the function. + + Returns: + str: The function call expression string. + """ + func_name = func.__name__ + args_str = ", ".join(repr(arg) for arg in args) + kwargs_str = ", ".join(f"{k}={repr(v)}" for k, v in kwargs.items()) + + if args_str and kwargs_str: + full_args_str = f"{args_str}, {kwargs_str}" + else: + full_args_str = args_str or kwargs_str + + return f"{func_name}({full_args_str})" + + +# Define a list of safe built-ins +SAFE_BUILTINS = { + "abs": abs, + "all": all, + "any": any, + "bin": bin, + "bool": bool, + "bytearray": bytearray, + "bytes": bytes, + "callable": callable, + "chr": chr, + "complex": complex, + "dict": dict, + "divmod": divmod, + "enumerate": enumerate, + "filter": filter, + "float": float, + "format": format, + "frozenset": frozenset, + "getattr": getattr, + "hasattr": hasattr, + "hash": hash, + "hex": hex, + "int": int, + "isinstance": isinstance, + "issubclass": issubclass, + "iter": iter, + "len": len, + "list": list, + "map": map, + "max": max, + "min": min, + "next": next, + "object": object, + "oct": oct, + "ord": ord, + "pow": pow, + "range": range, + "repr": repr, + "reversed": reversed, + "round": round, + "set": set, + "slice": slice, + "sorted": sorted, + "str": str, + "sum": sum, + "tuple": tuple, + "type": type, + "zip": zip, +} + + +def sandbox_exec( + code: str, context: Optional[Dict[str, object]] = None, timeout: int = 5 +) -> Dict: + r"""Execute code in a sandboxed environment with a timeout. + + 1. Works similar to eval(), but with timeout and context similar to parse_function_call_expr. + 2. With more flexibility as you can write additional function in the code compared with simply the function call. + + Args: + code (str): The code to execute. Has to be output=... or similar so that the result can be captured. + context (Dict[str, Any]): The context to use for the execution. + timeout (int): The execution timeout in seconds. 
+ + """ + result = {"output": None, "error": None} + context = {**context, **SAFE_BUILTINS} if context else SAFE_BUILTINS + try: + compiled_code = compile(code, "", "exec") + + # Result dictionary to store execution results + + # Define a target function for the thread + def target(): + try: + # Execute the code + exec(compiled_code, context, result) + except Exception as e: + result["error"] = e + + # Create a thread to execute the code + thread = threading.Thread(target=target) + thread.start() + thread.join(timeout) + + # Check if the thread is still alive (timed out) + if thread.is_alive(): + result["error"] = TimeoutError("Execution timed out") + raise TimeoutError("Execution timed out") + except Exception as e: + print(f"Errpr at sandbox_exec: {e}") + raise e + + return result + + +######################################################################################## +# For ** component +######################################################################################## +def compose_model_kwargs(default_model_kwargs: Dict, model_kwargs: Dict) -> Dict: + r""" + The model configuration exclude the input itself. + Combine the default model, model_kwargs with the passed model_kwargs. + Example: + model_kwargs = {"temperature": 0.5, "model": "gpt-3.5-turbo"} + self.model_kwargs = {"model": "gpt-3.5"} + combine_kwargs(model_kwargs) => {"temperature": 0.5, "model": "gpt-3.5-turbo"} + + """ + pass_model_kwargs = default_model_kwargs.copy() + + if model_kwargs: + pass_model_kwargs.update(model_kwargs) + return pass_model_kwargs + + +######################################################################################## +# For Tokenizer component +######################################################################################## +VECTOR_TYPE = Union[List[float], np.ndarray] + + +def is_normalized(v: VECTOR_TYPE, tol=1e-4) -> bool: + if isinstance(v, list): + v = np.array(v) + # Compute the norm of the vector (assuming v is 1D) + norm = np.linalg.norm(v) + # Check if the norm is approximately 1 + return np.abs(norm - 1) < tol + + +def normalize_np_array(v: np.ndarray) -> np.ndarray: + # Compute the norm of the vector (assuming v is 1D) + norm = np.linalg.norm(v) + # Normalize the vector + normalized_v = v / norm + # Return the normalized vector + return normalized_v + + +def normalize_vector(v: VECTOR_TYPE) -> List[float]: + if isinstance(v, list): + v = np.array(v) + # Compute the norm of the vector (assuming v is 1D) + norm = np.linalg.norm(v) + # Normalize the vector + normalized_v = v / norm + # Return the normalized vector as a list + return normalized_v.tolist() + + +def get_top_k_indices_scores( + scores: Union[List[float], np.ndarray], top_k: int +) -> Tuple[List[int], List[float]]: + if isinstance(scores, list): + scores_np = np.array(scores) + else: + scores_np = scores + top_k_indices = np.argsort(scores_np)[-top_k:][::-1] + top_k_scores = scores_np[top_k_indices] + return top_k_indices.tolist(), top_k_scores.tolist() + + +def generate_readable_key_for_function(fn: Callable) -> str: + + module_name = fn.__module__ + function_name = fn.__name__ + return f"{module_name}.{function_name}" + + +def extract_json_str(text: str, add_missing_right_brace: bool = True) -> str: + """ + Extract JSON string from text. + NOTE: Only handles the first JSON object or array found in the text. And it expects at least one JSON object in the text. + If right brace is not found, we add one to the end of the string. 
+ """ + # NOTE: this regex parsing is taken from langchain.output_parsers.pydantic + text = text.strip() + start_obj = text.find("{") + start_arr = text.find("[") + if start_obj == -1 and start_arr == -1: + raise ValueError(f"No JSON object or array found in the text: {text}") + + start = min( + start_obj if start_obj != -1 else float("inf"), + start_arr if start_arr != -1 else float("inf"), + ) + open_brace = text[start] + # Attempt to find the matching closing brace + brace_count = 0 + end = -1 + for i in range(start, len(text)): + if text[i] == open_brace: + brace_count += 1 + elif text[i] == ("}" if open_brace == "{" else "]"): + brace_count -= 1 + + if brace_count == 0: + end = i + break + + if end == -1 and add_missing_right_brace: + # If no closing brace is found, but we are allowed to add one + log.debug("Adding missing right brace to the JSON string.") + text += "}" if open_brace == "{" else "]" + end = len(text) - 1 + elif end == -1: + raise ValueError( + "Incomplete JSON object found and add_missing_right_brace is False." + ) + + return text[start : end + 1] + + +def extract_list_str(text: str, add_missing_right_bracket: bool = True) -> str: + """ + Extract the first complete list string from the provided text. If the list string is incomplete + (missing the closing bracket), an option allows adding a closing bracket at the end. + + Args: + text (str): The text containing potential list data. + add_missing_right_bracket (bool): Whether to add a closing bracket if it is missing. + + Returns: + str: The extracted list string. + + Raises: + ValueError: If no list is found or if the list extraction is incomplete + without the option to add a missing bracket. + """ + text = text.strip() + start = text.find("[") + if start == -1: + raise ValueError("No list found in the text.") + + # Attempt to find the matching closing bracket + bracket_count = 0 + end = -1 + for i in range(start, len(text)): + if text[i] == "[": + bracket_count += 1 + elif text[i] == "]": + bracket_count -= 1 + + if bracket_count == 0: + end = i + break + + if end == -1 and add_missing_right_bracket: + # If no closing bracket is found, but we are allowed to add one + text += "]" + end = len(text) - 1 + elif end == -1: + raise ValueError( + "Incomplete list found and add_missing_right_bracket is False." + ) + + return text[start : end + 1] + + +def extract_yaml_str(text: str) -> str: + r"""Extract YAML string from text. + + In default, we use regex pattern to match yaml code blocks within triple backticks with optional yaml or yml prefix. + """ + try: + yaml_re_pattern: re.Pattern = re.compile( + r"^```(?:ya?ml)?(?P[^`]*)", re.MULTILINE | re.DOTALL + ) + match = yaml_re_pattern.search(text.strip()) + + yaml_str = "" + if match: + yaml_str = match.group("yaml") + else: + yaml_str = text.strip() + return yaml_str + except Exception as e: + raise ValueError(f"Failed to extract YAML from text: {e}") + + +def fix_json_missing_commas(json_str: str) -> str: + # Example: adding missing commas, only after double quotes + # Regular expression to find missing commas + regex = r'(?<=[}\]"\'\d])(\s+)(?=[\{"\[])' + + # Add commas where missing + fixed_json_str = re.sub(regex, r",\1", json_str) + + return fixed_json_str + + +def fix_json_escaped_single_quotes(json_str: str) -> str: + # First, replace improperly escaped single quotes inside strings + # json_str = re.sub(r"(? Dict[str, Any]: + r""" + Parse a YAML string to a Python object. + yaml_str: has to be a valid YAML string. 
+ """ + try: + import yaml + + yaml_obj = yaml.safe_load(yaml_str) + return yaml_obj + except yaml.YAMLError as e: + raise ValueError( + f"Got invalid YAML object. Error: {e}. Got YAML string: {yaml_str}" + ) + except NameError as exc: + raise ImportError("Please pip install PyYAML.") from exc + + +def parse_json_str_to_obj(json_str: str) -> Dict[str, Any]: + r""" + Parse a JSON string to a Python object. + json_str: has to be a valid JSON string. Either {} or []. + """ + json_str = json_str.strip() + try: + json_obj = json.loads(json_str) + return json_obj + except json.JSONDecodeError: + # 2nd attemp after fixing the json string + try: + print("Trying to fix potential missing commas...") + json_str = fix_json_missing_commas(json_str) + print("Trying to fix scaped single quotes...") + json_str = fix_json_escaped_single_quotes(json_str) + print(f"Fixed JSON string: {json_str}") + json_obj = json.loads(json_str) + return json_obj + except json.JSONDecodeError: + # 3rd attemp using yaml + try: + import yaml + + # NOTE: parsing again with pyyaml + # pyyaml is less strict, and allows for trailing commas + # right now we rely on this since guidance program generates + # trailing commas + print("Parsing JSON string with PyYAML...") + json_obj = yaml.safe_load(json_str) + return json_obj + except yaml.YAMLError as e: + raise ValueError( + f"Got invalid JSON object. Error: {e}. Got JSON string: {json_str}" + ) + except NameError as exc: + raise ImportError("Please pip install PyYAML.") from exc diff --git a/lightrag/core/generator.py b/lightrag/lightrag/core/generator.py similarity index 82% rename from lightrag/core/generator.py rename to lightrag/lightrag/core/generator.py index b08a80ed..a111b10f 100644 --- a/lightrag/core/generator.py +++ b/lightrag/lightrag/core/generator.py @@ -2,18 +2,18 @@ from copy import deepcopy import logging -from lightrag.core.types import ModelType, GeneratorOutput +from lightrag.core.types import ( + ModelType, + GeneratorOutput, + GeneratorOutputType, +) from lightrag.core.component import Component from lightrag.core.parameter import Parameter from lightrag.core.prompt_builder import Prompt -from lightrag.core.functional import compose_model_kwargs from lightrag.core.model_client import ModelClient from lightrag.core.default_prompt_template import DEFAULT_LIGHTRAG_SYSTEM_PROMPT -GeneratorInputType = str -GeneratorOutputType = GeneratorOutput - log = logging.getLogger(__name__) @@ -31,8 +31,8 @@ class Generator(Component): model_client (ModelClient): The model client to use for the generator. model_kwargs (Dict[str, Any], optional): The model kwargs to pass to the model client. Defaults to {}. Please refer to :ref:`ModelClient` for the details on how to set the model_kwargs for your specific model if it is from our library. template (Optional[str], optional): The template for the prompt. Defaults to :ref:`DEFAULT_LIGHTRAG_SYSTEM_PROMPT`. - preset_prompt_kwargs (Optional[Dict], optional): The preset prompt kwargs to fill in the variables in the prompt. Defaults to None. - output_processors (Optional[Component], optional): The output processors after model call. Defaults to None. + prompt_kwargs (Optional[Dict], optional): The preset prompt kwargs to fill in the variables in the prompt. Defaults to None. + output_processors (Optional[Component], optional): The output processors after model call. It can be a single component or a chained component via ``Sequential``. Defaults to None. trainable_params (Optional[List[str]], optional): The list of trainable parameters. 
Defaults to []. Note: @@ -50,7 +50,7 @@ def __init__( model_kwargs: Dict[str, Any] = {}, # args for the prompt template: Optional[str] = None, - preset_prompt_kwargs: Optional[Dict] = {}, + prompt_kwargs: Optional[Dict] = {}, # args for the output processing output_processors: Optional[Component] = None, # args for the trainable parameters @@ -63,8 +63,8 @@ def __init__( - chat_history_str - context_str - steps_str - You can preset the prompt kwargs to fill in the variables in the prompt using preset_prompt_kwargs. - But you can replace the prompt and set any variables you want and use the preset_prompt_kwargs to fill in the variables. + You can preset the prompt kwargs to fill in the variables in the prompt using prompt_kwargs. + But you can replace the prompt and set any variables you want and use the prompt_kwargs to fill in the variables. """ if not isinstance(model_client, ModelClient): @@ -73,15 +73,20 @@ def __init__( ) template = template or DEFAULT_LIGHTRAG_SYSTEM_PROMPT + try: + prompt_kwargs = deepcopy(prompt_kwargs) + except Exception as e: + log.warning(f"Error copying the prompt_kwargs: {e}") + prompt_kwargs = prompt_kwargs super().__init__( model_kwargs=model_kwargs, template=template, - preset_prompt_kwargs=preset_prompt_kwargs, + prompt_kwargs=prompt_kwargs, trainable_params=trainable_params, ) - self._init_prompt(template, preset_prompt_kwargs) + self._init_prompt(template, prompt_kwargs) self.model_kwargs = model_kwargs.copy() # init the model client @@ -98,18 +103,16 @@ def __init__( f"trainable_params: {param} not found in the prompt_variables: {prompt_variables}" ) # Create a Parameter object and assign it as an attribute with the same name as the value of param - default_value = self.preset_prompt_kwargs.get(param, None) + default_value = self.prompt_kwargs.get(param, None) setattr(self, param, Parameter[Union[str, None]](data=default_value)) self._trainable_params.append(param) # end of trainable parameters - def _init_prompt(self, template: str, preset_prompt_kwargs: Dict): - r"""Initialize the prompt with the template and preset_prompt_kwargs.""" + def _init_prompt(self, template: str, prompt_kwargs: Dict): + r"""Initialize the prompt with the template and prompt_kwargs.""" self.template = template - self.preset_prompt_kwargs = preset_prompt_kwargs - self.prompt = Prompt( - template=template, preset_prompt_kwargs=preset_prompt_kwargs - ) + self.prompt_kwargs = prompt_kwargs + self.prompt = Prompt(template=template, prompt_kwargs=prompt_kwargs) @classmethod def from_config(cls, config: Dict[str, Any]) -> "Generator": @@ -142,7 +145,7 @@ def from_config(cls, config: Dict[str, Any]) -> "Generator": # prompt_text = self.prompt.call(**kwargs) # return prompt_text - def update_default_model_kwargs(self, **model_kwargs) -> Dict: + def _compose_model_kwargs(self, **model_kwargs) -> Dict: r""" The model configuration exclude the input itself. Combine the default model, model_kwargs with the passed model_kwargs. 
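The `preset_prompt_kwargs` → `prompt_kwargs` rename above changes how a `Generator` is configured. A minimal sketch of the new argument, assuming an OpenAI-compatible client is configured via the environment; the model name and template variable values here are illustrative, not prescribed by this diff:

```python
from lightrag.core import Generator
from lightrag.components.model_client import OpenAIClient

generator = Generator(
    model_client=OpenAIClient(),
    model_kwargs={"model": "gpt-3.5-turbo"},  # illustrative model name
    # formerly `preset_prompt_kwargs`: preset once here, merged with per-call kwargs
    prompt_kwargs={"task_desc_str": "You are a concise assistant."},
)

# Remaining template variables are supplied at call time.
output = generator({"input_str": "What is a Component?"})
print(output)
```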
@@ -152,7 +155,11 @@ def update_default_model_kwargs(self, **model_kwargs) -> Dict: combine_kwargs(model_kwargs) => {"temperature": 0.5, "model": "gpt-3.5-turbo"} """ - return compose_model_kwargs(self.model_kwargs, model_kwargs) + combined_model_kwargs = self.model_kwargs.copy() + + if model_kwargs: + combined_model_kwargs.update(model_kwargs) + return combined_model_kwargs def print_prompt(self, **kwargs) -> str: self.prompt.print_prompt(**kwargs) @@ -193,7 +200,7 @@ def _pre_call(self, prompt_kwargs: Dict, model_kwargs: Dict) -> Dict[str, Any]: system_prompt_str = self.prompt.call(**prompt_kwargs).strip() # 2. combine the model_kwargs with the default model_kwargs - composed_model_kwargs = self.update_default_model_kwargs(**model_kwargs) + composed_model_kwargs = self._compose_model_kwargs(**model_kwargs) # 3. convert app's inputs to api inputs api_kwargs = self.model_client.convert_inputs_to_api_kwargs( @@ -225,10 +232,16 @@ def call( log.info(f"model_kwargs: {model_kwargs}") api_kwargs = self._pre_call(prompt_kwargs, model_kwargs) - completion = self.model_client.call( - api_kwargs=api_kwargs, model_type=self.model_type - ) - output = self._post_call(completion) + output: GeneratorOutputType = None + # call the model client + try: + completion = self.model_client.call( + api_kwargs=api_kwargs, model_type=self.model_type + ) + output = self._post_call(completion) + except Exception as e: + log.error(f"Error calling the model: {e}") + output = GeneratorOutput(error=str(e)) log.info(f"output: {output}") return output diff --git a/lightrag/core/model_client.py b/lightrag/lightrag/core/model_client.py similarity index 100% rename from lightrag/core/model_client.py rename to lightrag/lightrag/core/model_client.py diff --git a/lightrag/core/parameter.py b/lightrag/lightrag/core/parameter.py similarity index 100% rename from lightrag/core/parameter.py rename to lightrag/lightrag/core/parameter.py diff --git a/lightrag/core/prompt_builder.py b/lightrag/lightrag/core/prompt_builder.py similarity index 70% rename from lightrag/core/prompt_builder.py rename to lightrag/lightrag/core/prompt_builder.py index f3306ba7..4be8501a 100644 --- a/lightrag/core/prompt_builder.py +++ b/lightrag/lightrag/core/prompt_builder.py @@ -1,10 +1,12 @@ -import jinja2.meta -from jinja2 import Template, Environment -import jinja2 -from typing import Dict, Any, Optional, List, Type, TypeVar +"""Class prompt builder for LightRAG system prompt.""" + +from typing import Dict, Any, Optional, List, TypeVar import logging from functools import lru_cache +from jinja2 import Template, Environment, StrictUndefined, meta + + from lightrag.core.component import Component from lightrag.core.default_prompt_template import DEFAULT_LIGHTRAG_SYSTEM_PROMPT @@ -19,7 +21,7 @@ def get_jinja2_environment(): r"""Helper function for Prompt component to get the Jinja2 environment with the default settings.""" try: default_environment = Environment( - undefined=jinja2.StrictUndefined, + undefined=StrictUndefined, trim_blocks=True, keep_trailing_newline=True, lstrip_blocks=True, @@ -31,9 +33,9 @@ def get_jinja2_environment(): class Prompt(Component): - __doc__ = r"""A component that renders a text string from a template using Jinja2 templates. + __doc__ = r"""Renders a text string(prompt) from a Jinja2 template string. - In default, we use the :ref:`DEFAULT_LIGHTRAG_SYSTEM_PROMPT` as the template. + In default, we use the :ref:`DEFAULT_LIGHTRAG_SYSTEM_PROMPT` as the template. Args: template (str, optional): The Jinja2 template string. 
Defaults to DEFAULT_LIGHTRAG_SYSTEM_PROMPT. @@ -48,7 +50,7 @@ class Prompt(Component): >>> prompt.call(context_str="This is a context string.") When examples_str itself is another template with variables, You can use another Prompt to render it. - + >>> EXAMPLES_TEMPLATE = r''' >>> {% if examples %} >>> {% for example in examples %} @@ -64,9 +66,8 @@ class Prompt(Component): def __init__( self, - *, template: Optional[str] = None, - preset_prompt_kwargs: Optional[Dict] = {}, + prompt_kwargs: Optional[Dict] = {}, ): super().__init__() @@ -78,7 +79,7 @@ def __init__( logger.info(f"{__class__.__name__} has variables: {self.prompt_variables}") - self.preset_prompt_kwargs = preset_prompt_kwargs + self.prompt_kwargs = prompt_kwargs.copy() def __create_jinja2_template(self): r"""Create the Jinja2 template object.""" @@ -89,9 +90,9 @@ def __create_jinja2_template(self): except Exception as e: raise ValueError(f"Invalid Jinja2 template: {e}") - def update_preset_prompt_kwargs(self, **kwargs): - r"""Update the preset prompt kwargs after Prompt is initialized.""" - self.preset_prompt_kwargs.update(kwargs) + def update_prompt_kwargs(self, **kwargs): + r"""Update the initial prompt kwargs after Prompt is initialized.""" + self.prompt_kwargs.update(kwargs) def get_prompt_variables(self) -> List[str]: r"""Get the prompt kwargs.""" @@ -104,13 +105,13 @@ def is_key_in_template(self, key: str) -> bool: def _find_template_variables(self, template_str: str): """Automatically find all the variables in the template.""" parsed_content = self.jinja2_template.environment.parse(template_str) - return jinja2.meta.find_undeclared_variables(parsed_content) + return meta.find_undeclared_variables(parsed_content) def compose_prompt_kwargs(self, **kwargs) -> Dict: - r"""Compose the final prompt kwargs by combining the preset_prompt_kwargs and the provided kwargs.""" + r"""Compose the final prompt kwargs by combining the initial and the provided kwargs at runtime.""" composed_kwargs = {key: None for key in self.prompt_variables} - if self.preset_prompt_kwargs: - composed_kwargs.update(self.preset_prompt_kwargs) + if self.prompt_kwargs: + composed_kwargs.update(self.prompt_kwargs) if kwargs: for key, _ in kwargs.items(): if key not in composed_kwargs: @@ -121,9 +122,9 @@ def compose_prompt_kwargs(self, **kwargs) -> Dict: def print_prompt_template(self): r"""Print the template string.""" print("Template:") - print(f"-------") + print("-------") print(f"{self.template}") - print(f"-------") + print("-------") def print_prompt(self, **kwargs): r"""Print the rendered prompt string using the preset_prompt_kwargs and the provided kwargs.""" @@ -132,14 +133,14 @@ def print_prompt(self, **kwargs): logger.debug(f"Prompt kwargs: {pass_kwargs}") prompt_str = self.jinja2_template.render(**pass_kwargs) - print("Prompt:") + print("Prompt:\n______________________") print(prompt_str) except Exception as e: raise ValueError(f"Error rendering Jinja2 template: {e}") def call(self, **kwargs) -> str: """ - Renders the prompt template with the provided variables. + Renders the prompt template with keyword arguments. 
""" try: pass_kwargs = self.compose_prompt_kwargs(**kwargs) @@ -152,8 +153,8 @@ def call(self, **kwargs) -> str: def _extra_repr(self) -> str: s = f"template: {self.template}" - if self.preset_prompt_kwargs: - s += f", preset_prompt_kwargs: {self.preset_prompt_kwargs}" + if self.prompt_kwargs: + s += f", prompt_kwargs: {self.prompt_kwargs}" if self.prompt_variables: s += f", prompt_variables: {self.prompt_variables}" return s @@ -173,39 +174,3 @@ def to_dict(self) -> Dict[str, Any]: exclude = ["jinja2_template"] # unserializable object output = super().to_dict(exclude=exclude) return output - - -if __name__ == "__main__": - import logging - - logging.basicConfig(level=logging.DEBUG) - prompt = Prompt( - preset_prompt_kwargs={"task_desc_str": "You are a helpful assistant."} - ) - print(prompt) - prompt.print_prompt_template() - prompt.print_prompt(context_str="This is a context string.") - prompt.call(context_str="This is a context string.") - states = prompt.state_dict() - print(f"states: {states}") - named_params = prompt.named_parameters() - print(f"named_params: {named_params}") - for name, param in named_params: - print(f"{name}: {param}") - - # get dict of prompt - prompt_dict = prompt.to_dict() - print(f"prompt_dict: {prompt_dict}") - prompt_state = prompt.state_dict() - print(f"prompt_state: {prompt_state}") - - # EXAMPLES_TEMPLATE = r""" - # {% if examples %} - # {% for example in examples %} - # {{loop.index}}. {{example}} - # {% endfor %} - # {% endif %} - # """ - # examples_prompt = Prompt(template=EXAMPLES_TEMPLATE) - # examples_str = examples_prompt.call(examples=["Example 1", "Example 2"]) - # prompt.print_prompt(examples_str=examples_str) diff --git a/lightrag/core/retriever.py b/lightrag/lightrag/core/retriever.py similarity index 100% rename from lightrag/core/retriever.py rename to lightrag/lightrag/core/retriever.py diff --git a/lightrag/lightrag/core/string_parser.py b/lightrag/lightrag/core/string_parser.py new file mode 100644 index 00000000..e08ecd69 --- /dev/null +++ b/lightrag/lightrag/core/string_parser.py @@ -0,0 +1,102 @@ +""" +LLM applications requires lots of string processing. Such as the text output needed to be parsed into: +(1) JSON format or other formats +(2) SQL/Python valid format +(3) Tool(function) call format + +We design this these string_parser modules to be generic to any input text without differentiating them as input text or output text. +""" + +from typing import Any, Dict, List +import logging + +from lightrag.core.component import Component +import lightrag.core.functional as F + +log = logging.getLogger(__name__) + + +class ListParser(Component): + __doc__ = r"""To extract list strings from text and parse them into a list object. + + Examples: + + .. code-block:: python + + list_parser = ListParser() + test_input_4 = 'Some random text before ["item1", "item2"] and more after' + print(list_parser(test_input_4)) # Expected to extract ["item1", "item2"] + """ + + def __init__(self, add_missing_right_bracket: bool = True): + super().__init__() + self.add_missing_right_bracket = add_missing_right_bracket + + def __call__(self, input: str) -> List[Any]: + input = input.strip() + try: + list_str = F.extract_list_str(input, self.add_missing_right_bracket) + list_obj = F.parse_json_str_to_obj(list_str) + return list_obj + except Exception as e: + raise ValueError(f"Error: {e}") + + +JASON_PARSER_OUTPUT_TYPE = Dict[str, Any] + + +class JsonParser(Component): + __doc__ = r"""To extract JSON strings from text and parse them into a JSON object. 
+ + Examples: + + .. code-block:: python + + json_parser = JsonParser() + json_str = "```json\n{\"key\": \"value\"}\n```" + json_obj = json_parser(json_str) + print(json_obj) # Expected to extract {"key": "value"} + """ + + def __init__(self, add_missing_right_brace: bool = True): + super().__init__() + self.add_missing_right_brace = add_missing_right_brace + + def call(self, input: str) -> JASON_PARSER_OUTPUT_TYPE: + input = input.strip() + try: + json_str = F.extract_json_str(input, self.add_missing_right_brace) + log.debug(f"json_str: {json_str}") + json_obj = F.parse_json_str_to_obj(json_str) + return json_obj + except Exception as e: + raise ValueError(f"Error: {e}") + + +YAML_PARSER_OUTPUT_TYPE = Dict[str, Any] + + +class YamlParser(Component): + __doc__ = r"""To extract YAML strings from text and parse them into a YAML object. + + Examples: + + .. code-block:: python + + yaml_parser = YamlParser() + yaml_str = "```yaml\nkey: value\n```" + yaml_obj = yaml_parser(yaml_str) + print(yaml_obj) # Expected to extract {"key": "value"} + """ + + def __init__(self): + super().__init__() + + def call(self, input: str) -> YAML_PARSER_OUTPUT_TYPE: + input = input.strip() + try: + yaml_str = F.extract_yaml_str(input) + yaml_obj = F.parse_yaml_str_to_obj(yaml_str) + return yaml_obj + except Exception as e: + raise ValueError(f"Error: {e}") diff --git a/lightrag/core/tokenizer.py b/lightrag/lightrag/core/tokenizer.py similarity index 100% rename from lightrag/core/tokenizer.py rename to lightrag/lightrag/core/tokenizer.py diff --git a/lightrag/lightrag/core/tool_manager.py b/lightrag/lightrag/core/tool_manager.py new file mode 100644 index 00000000..d835c658 --- /dev/null +++ b/lightrag/lightrag/core/tool_manager.py @@ -0,0 +1,142 @@ +from typing import List, Dict, Optional, Any, Callable, Awaitable, Union +import logging +from copy import deepcopy + +from lightrag.core import Component +from lightrag.core.func_tool import FunctionTool +from lightrag.core.types import ( + FunctionDefinition, + FunctionOutput, + Function, + FunctionExpression, +) + +from lightrag.core.functional import ( + parse_function_call_expr, + sandbox_exec, +) + +log = logging.getLogger(__name__) + + +AsyncCallable = Callable[..., Awaitable[Any]] + +ToolType = Union[FunctionTool, Callable[..., Any], Awaitable[Callable[..., Any]]] +ToolsType = List[ToolType] + + +# TODO: good to track all the failed function calls +class ToolManager(Component): + __doc__ = r""""Manage a list of tools, context, and all ways to execute functions. + + yaml and json definitions are for quick access to the definitions of the tools. + If you need more specification, such as using exclude field, you can use the function_definitions. 
+ Args: + + + """ + + def __init__( + self, + tools: ToolsType = [], + additional_context: Optional[ + Dict[str, object] + ] = {}, # anything besides the tools + ): + super().__init__() + # super(LocalDB, self).__init__() + self.tools = [ + ( + FunctionTool(fn=deepcopy(tool)) + if not isinstance(tool, FunctionTool) + else deepcopy(tool) + ) + for tool in tools + ] + self._context_map = {tool.definition.func_name: tool for tool in self.tools} + self._additional_context = additional_context or {} + self.context = {**self._context_map, **self._additional_context} + log.info( + f"Initialized ToolManager with {len(self.tools)} tools and additional context {self._additional_context}" + ) + + @property + def yaml_definitions(self) -> List[str]: + return [tool.definition.to_yaml() for tool in self.tools] + + @property + def json_definitions(self) -> List[str]: + return [tool.definition.to_json() for tool in self.tools] + + @property + def function_definitions(self) -> List[FunctionDefinition]: + return [tool.definition for tool in self.tools] + + def parse_func_expr(self, expr: FunctionExpression) -> Function: + r"""Parse the function call expression.""" + try: + expr_str = expr.action + func_name, args, kwargs = parse_function_call_expr(expr_str, self.context) + return Function(name=func_name, args=args, kwargs=kwargs) + except Exception as e: + log.error(f"Error {e} parsing function call expression: {expr_str}") + raise ValueError(f"Error {e} parsing function call expression: {expr_str}") + + def execute_func(self, func: Function) -> FunctionOutput: + r"""Execute the function. Support both sync and async functions.""" + try: + tool = self.context[func.name] + return tool(*func.args, **func.kwargs) + except Exception as e: + log.error(f"Error {e} executing function: {func}") + raise ValueError(f"Error {e} executing function: {func}") + + def execute_func_expr(self, expr: FunctionExpression) -> FunctionOutput: + r"""Execute the function expression. Support both sync and async functions.""" + try: + func: Function = self.parse_func_expr(expr) + return self.execute_func(func) + except Exception as e: + log.error(f"Error {e} executing function expression: {expr}") + raise ValueError(f"Error {e} executing function expression: {expr}") + + def execute_func_expr_via_sandbox(self, expr: FunctionExpression) -> FunctionOutput: + r"""Execute the function expression via sandbox. Only support sync functions.""" + func_output = FunctionOutput( + name=expr.action, input=expr, output=None, error=None + ) + try: + action = ( + "output = " + expr.action + if not expr.action.startswith("output") + else expr.action + ) + result = sandbox_exec(action, self.context) + output = result.get("output", None) + error = result.get("error", None) + func_output.output = output + func_output.error = error + + except Exception as e: + log.error(f"Error {e} executing function expression: {expr}") + raise ValueError(f"Error {e} executing function expression: {expr}") + + return func_output + + def execute_func_expr_via_eval(self, expr: FunctionExpression) -> FunctionOutput: + r"""Execute the function expression via eval. 
Only support sync functions.""" + try: + result = eval(expr.action, self.context) + return FunctionOutput( + name=expr.action, + input=expr, + output=result, + error=None, + ) + except Exception as e: + log.error(f"Error {e} executing function expression: {expr}") + raise ValueError(f"Error {e} executing function expression: {expr}") + + def _extra_repr(self) -> str: + s = f"Tools: {self.tools}, Additional Context: {self._additional_context}" + return s diff --git a/lightrag/core/types.py b/lightrag/lightrag/core/types.py similarity index 59% rename from lightrag/core/types.py rename to lightrag/lightrag/core/types.py index 7cc3ac3d..efcea9a7 100644 --- a/lightrag/core/types.py +++ b/lightrag/lightrag/core/types.py @@ -1,7 +1,19 @@ """Functional data classes to support functional components like Generator, Retriever, and Assistant.""" from enum import Enum, auto -from typing import List, Dict, Any, Optional, Union, Generic, TypeVar, Sequence +from typing import ( + List, + Dict, + Any, + Optional, + Union, + Generic, + TypeVar, + Sequence, + Literal, + Callable, + Awaitable, +) from collections import OrderedDict from dataclasses import ( dataclass, @@ -13,9 +25,12 @@ import uuid import logging -from lightrag.core.base_data_class import DataClass +from lightrag.core.base_data_class import DataClass, required_field from lightrag.core.tokenizer import Tokenizer -from lightrag.core.functional import is_normalized +from lightrag.core.functional import ( + is_normalized, + generate_function_call_expression_from_callable, +) from lightrag.components.model_client import ( CohereAPIClient, TransformersClient, @@ -30,6 +45,9 @@ T_co = TypeVar("T_co", covariant=True) +####################################################################################### +# Data modeling for ModelClient +###################################################################################### class ModelType(Enum): EMBEDDER = auto() LLM = auto() @@ -69,6 +87,9 @@ def get_model_args(model_type: ModelType) -> List[str]: return [] +####################################################################################### +# Data modeling for Embedder component +###################################################################################### @dataclass class Embedding: """ @@ -91,14 +112,6 @@ class Usage: total_tokens: int -@dataclass -class TokenLogProb: - r"""similar to openai.ChatCompletionTokenLogprob""" - - token: str - logprob: float - - @dataclass class EmbedderOutput(DataClass): __doc__ = r"""Container to hold the response from an Embedder model. Only Per-batch. 
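The `ToolManager` introduced above turns plain callables into `FunctionTool`s and executes parsed call expressions against its context map. A minimal sketch of that flow, assuming `FunctionTool` wraps a bare function passed as `fn` (as the constructor above suggests) and returns a `FunctionOutput` when called:

```python
from lightrag.core.tool_manager import ToolManager
from lightrag.core.types import FunctionExpression

def multiply(a: int, b: int) -> int:
    """Multiply two integers."""
    return a * b

manager = ToolManager(tools=[multiply])
# YAML definitions are meant to be rendered into the prompt for the LLM.
print(manager.yaml_definitions)

# Execute an LLM-style call expression against the managed context.
expr = FunctionExpression(action="multiply(a=6, b=7)")
result = manager.execute_func_expr(expr)
print(result)  # expected: a FunctionOutput carrying 42
```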
@@ -154,6 +167,17 @@ def is_normalized(self) -> bool: BatchEmbedderOutputType = List[EmbedderOutputType] +####################################################################################### +# Data modeling for Generator component +###################################################################################### +@dataclass +class TokenLogProb: + r"""similar to openai.ChatCompletionTokenLogprob""" + + token: str + logprob: float + + @dataclass class GeneratorOutput(DataClass, Generic[T_co]): __doc__ = r""" @@ -180,12 +204,264 @@ class GeneratorOutput(DataClass, Generic[T_co]): usage: Optional[Usage] = field(default=None, metadata={"desc": "Usage tracking"}) raw_response: Optional[str] = field( default=None, metadata={"desc": "Raw string response from the model"} + ) # parsed from model client response + metadata: Optional[Dict[str, object]] = field( + default=None, metadata={"desc": "Additional metadata"} + ) + + +GeneratorOutputType = GeneratorOutput[object] + +####################################################################################### +# Data modeling for Retriever component +###################################################################################### + +RetrieverQueryType = TypeVar("RetrieverQueryType", contravariant=True) +RetrieverStrQueryType = str +RetrieverQueriesType = Union[RetrieverQueryType, Sequence[RetrieverQueryType]] +RetrieverStrQueriesType = Union[str, Sequence[RetrieverStrQueryType]] + +RetrieverDocumentType = TypeVar("RetrieverDocumentType", contravariant=True) +RetrieverStrDocumentType = str # for text retrieval +RetrieverDocumentsType = Sequence[RetrieverDocumentType] + + +@dataclass +class RetrieverOutput(DataClass): + __doc__ = r"""Save the output of a single query in retrievers. + + It is up to the subclass of Retriever to specify the type of query and document. + """ + + doc_indices: List[int] = field(metadata={"desc": "List of document indices"}) + doc_scores: Optional[List[float]] = field( + default=None, metadata={"desc": "List of document scores"} + ) + query: Optional[RetrieverQueryType] = field( + default=None, metadata={"desc": "The query used to retrieve the documents"} + ) + documents: Optional[List[RetrieverDocumentType]] = field( + default=None, metadata={"desc": "List of retrieved documents"} + ) + + +RetrieverOutputType = List[RetrieverOutput] # so to support multiple queries at once + + +####################################################################################### +# Data modeling for function calls +###################################################################################### +AsyncCallable = Callable[..., Awaitable[Any]] + + +@dataclass +class FunctionDefinition(DataClass): + __doc__ = r"""The data modeling of a function definition, including the name, description, and parameters.""" + + func_name: str = field(metadata={"desc": "The name of the tool"}) + func_desc: Optional[str] = field( + default=None, metadata={"desc": "The description of the tool"} + ) + func_parameters: Dict[str, object] = field( + default_factory=dict, metadata={"desc": "The schema of the parameters"} + ) + + def fn_schema_str(self, type: Literal["json", "yaml"] = "json") -> str: + r"""Get the function definition str to be used in the prompt. + + You should also directly use :meth:`to_json` and :meth:`to_yaml` to get the schema in JSON or YAML format. 
+ """ + if type == "json": + return self.to_json() + elif type == "yaml": + return self.to_yaml() + else: + raise ValueError(f"Unsupported type: {type}") + + +@dataclass +class Function(DataClass): + __doc__ = r"""The data modeling of a function call, including the name and keyword arguments. + + You can use the exclude in :meth:`to_json` and :meth:`to_yaml` to exclude the `thought` field if you do not want to use chain-of-thought pattern. + + Example: + + .. code-block:: python + + # assume the function is added in a context_map + # context_map = {"add": add} + + def add(a, b): + return a + b + + # call function add with arguments 1 and 2 + fun = Function(name="add", kwargs={"a": 1, "b": 2}) + # evaluate the function + result = context_map[fun.name](**fun.kwargs) + + # or call with positional arguments + fun = Function(name="add", args=[1, 2]) + result = context_map[fun.name](*fun.args) + """ + thought: Optional[str] = field( + default=None, metadata={"desc": "Why the function is called"} + ) + name: str = field(default="", metadata={"desc": "The name of the function"}) + args: Optional[List[object]] = field( + default_factory=list, + metadata={"desc": "The positional arguments of the function"}, + ) + kwargs: Optional[Dict[str, object]] = field( + default_factory=dict, + metadata={"desc": "The keyword arguments of the function"}, + ) + + +@dataclass +class FunctionExpression(DataClass): + __doc__ = r"""The data modeling of a function expression for a call, including the name and arguments. + + Example: + + .. code-block:: python + + def add(a, b): + return a + b + + # call function add with positional arguments 1 and 2 + fun_expr = FunctionExpression(action="add(1, 2)") + # evaluate the expression + result = eval(fun_expr.action) + print(result) + # Output: 3 + + # call function add with keyword arguments + fun_expr = FunctionExpression(action="add(a=1, b=2)") + result = eval(fun_expr.action) + print(result) + # Output: 3 + + Why asking LLM to generate function expression (code snippet) for a function call? + - It is more efficient/compact to call a function. + - It is more flexible. + (1) for the full range of Python expressions, including arithmetic operations, nested function calls, and more. + (2) allow to pass variables as arguments. + - Ease of parsing using ``ast`` module. + + The benefits are less failed function calls. + """ + thought: Optional[str] = field( + default=None, metadata={"desc": "Why the function is called"} + ) + action: str = field( + default_factory=required_field, + # metadata={"desc": "FuncName(, )"}, + metadata={ + "desc": """FuncName() \ + Valid function call expression. \ + Example: "FuncName(a=1, b=2)" \ + Follow the data type specified in the function parameters.\ + e.g. for Type object with x,y properties, use "ObjectType(x=1, y=2)""" + }, + ) + + @classmethod + def from_function( + cls, + func: Union[Callable[..., Any], AsyncCallable], + thought: Optional[str] = None, + *args, + **kwargs, + ) -> "FunctionExpression": + r"""Create a FunctionExpression object from a function. + + Args: + fun (Union[Callable[..., Any], AsyncCallable]): The function to be converted + + Returns: + FunctionExpression: The FunctionExpression object + + Usage: + 1. Create a FunctionExpression object from a function call: + 2. use :meth:`to_json` and :meth:`to_yaml` to get the schema in JSON or YAML format. + 3. This will be used as an example in prompt showing LLM how to call the function. 
+ """ + try: + action = generate_function_call_expression_from_callable( + func, *args, **kwargs + ) + except Exception as e: + logger.error(f"Error generating function expression: {e}") + raise ValueError(f"Error generating function expression: {e}") + return cls(action=action, thought=thought) + + +# saves the output of a function tool. + + +@dataclass +class FunctionOutput(DataClass): + __doc__ = ( + r"""The output of a tool, which could be a function, a class, or a module.""" + ) + name: Optional[str] = field( + default=None, metadata={"desc": "The name of the function"} ) + input: Optional[Union[Function, FunctionExpression]] = field( + default=None, metadata={"desc": "The Function or FunctionExpression object"} + ) + parsed_input: Optional[Function] = field( + default=None, + metadata={ + "desc": "The parsed Function object if the input is FunctionExpression" + }, + ) + output: Optional[object] = field( + default=None, metadata={"desc": "The output of the function execution"} + ) + error: Optional[str] = field( + default=None, metadata={"desc": "The error message if any"} + ) + +####################################################################################### +# Data modeling for agent component +###################################################################################### +@dataclass +class StepOutput(DataClass): + __doc__ = r"""The output of a single step in the agent.""" + step: int = field( + default=0, metadata={"desc": "The order of the step in the agent"} + ) + thought: Optional[str] = field( + default="", metadata={"desc": "The thought of the agent in the step"} + ) + action: str = field( + default="", metadata={"desc": "The action of the agent in the step"} + ) + fun_name: Optional[str] = field( + default=None, metadata={"desc": "The function named parsed from action"} + ) + fun_args: Optional[List[Any]] = field( + default=None, + metadata={"desc": "The function positional arguments parsed from action"}, + ) + fun_kwargs: Optional[Dict[str, Any]] = field( + default=None, + metadata={"desc": "The function keyword arguments parsed from action"}, + ) + observation: Optional[str] = field( + default=None, metadata={"desc": "The result of the action"} + ) -GeneratorOutputType = GeneratorOutput[Any] + def __str__(self): + return f"Thought {self.step}: {self.thought}\nAction {self.step}: {self.action}\nObservation {self.step}: {self.observation}" +####################################################################################### +# Data modeling for data processing pipleline such as Text splitting and Embedding +###################################################################################### @dataclass class Document(DataClass): __doc__ = r"""A text container with optional metadata and vector representation. @@ -277,38 +553,9 @@ def __repr__(self): ) -RetrieverQueryType = TypeVar("RetrieverQueryType", contravariant=True) -RetrieverStrQueryType = str -RetrieverQueriesType = Union[RetrieverQueryType, Sequence[RetrieverQueryType]] -RetrieverStrQueriesType = Union[str, Sequence[RetrieverStrQueryType]] - -RetrieverDocumentType = TypeVar("RetrieverDocumentType", contravariant=True) -RetrieverStrDocumentType = str # for text retrieval -RetrieverDocumentsType = Sequence[RetrieverDocumentType] - - -@dataclass -class RetrieverOutput(DataClass): - __doc__ = r"""Save the output of a single query in retrievers. - - It is up to the subclass of Retriever to specify the type of query and document. 
- """ - - doc_indices: List[int] = field(metadata={"desc": "List of document indices"}) - doc_scores: Optional[List[float]] = field( - default=None, metadata={"desc": "List of document scores"} - ) - query: Optional[RetrieverQueryType] = field( - default=None, metadata={"desc": "The query used to retrieve the documents"} - ) - documents: Optional[List[RetrieverDocumentType]] = field( - default=None, metadata={"desc": "List of retrieved documents"} - ) - - -RetrieverOutputType = List[RetrieverOutput] # so to support multiple queries at once - - +####################################################################################### +# Data modeling for dialog system +###################################################################################### @dataclass class UserQuery: query_str: str @@ -356,8 +603,9 @@ class DialogTurn(DataClass): user_id: Optional[str] = field( default=None, metadata={"desc": "The unique id of the user"} ) - session_id: Optional[str] = field( - default=None, metadata={"desc": "The unique id of the dialog session"} + conversation_id: Optional[str] = field( + default=None, + metadata={"desc": "The unique id of the conversation it belongs to"}, ) order: Optional[int] = field( default=None, @@ -405,21 +653,38 @@ def set_assistant_response( self.assistant_response_timestamp = assistant_response_timestamp +# TODO: This part and the Memory class is still WIP, and will need more work in the future. @dataclass -class DialogSession: - __doc__ = r"""A dialog session manages the dialog turns in a whole conversation as a session. +class Conversation: + __doc__ = r"""A conversation manages the dialog turns in a whole conversation as a session. This class is mainly used in-memory for the dialog system/app to manage active conversations. You won't need this class for past conversations which have already been persisted in a database as a form of record or history. """ - id: str = field(default_factory=lambda: str(uuid.uuid4())) # the id of the session - user_id: Optional[str] = None - dialog_turns: OrderedDict[int, DialogTurn] = field(default_factory=OrderedDict) + id: str = field( + default_factory=lambda: str(uuid.uuid4()), + metadata={"desc": "The id of the conversation"}, + ) # the id of the conversation + name: Optional[str] = field( + default=None, metadata={"desc": "The name of the conversation"} + ) + user_id: Optional[str] = field( + default=None, metadata={"desc": "The id of the user"} + ) + dialog_turns: OrderedDict[int, DialogTurn] = field( + default_factory=OrderedDict, metadata={"desc": "The dialog turns"} + ) # int is the order of the turn, starts from 0 - metadata: Optional[Dict[str, Any]] = None - session_start_timestamp: Optional[datetime] = field(default_factory=datetime.now) + metadata: Optional[Dict[str, Any]] = field( + default=None, metadata={"desc": "Additional metadata"} + ) + + created_at: Optional[datetime] = field( + default_factory=datetime.now, + metadata={"desc": "The timestamp of the conversation creation"}, + ) # InitVar type annotation is used for parameters that are used in __post_init__ # but not meant to be fields in the dataclass. 
diff --git a/lightrag/database/README.md b/lightrag/lightrag/database/README.md similarity index 100% rename from lightrag/database/README.md rename to lightrag/lightrag/database/README.md diff --git a/lightrag/database/sqlalchemy/pipeline/__init__.py b/lightrag/lightrag/database/__init__.py similarity index 100% rename from lightrag/database/sqlalchemy/pipeline/__init__.py rename to lightrag/lightrag/database/__init__.py diff --git a/lightrag/icl/__init__.py b/lightrag/lightrag/database/sqlalchemy/__init__.py similarity index 100% rename from lightrag/icl/__init__.py rename to lightrag/lightrag/database/sqlalchemy/__init__.py diff --git a/lightrag/database/sqlalchemy/base.py b/lightrag/lightrag/database/sqlalchemy/base.py similarity index 100% rename from lightrag/database/sqlalchemy/base.py rename to lightrag/lightrag/database/sqlalchemy/base.py diff --git a/lightrag/database/sqlalchemy/model.py b/lightrag/lightrag/database/sqlalchemy/model.py similarity index 100% rename from lightrag/database/sqlalchemy/model.py rename to lightrag/lightrag/database/sqlalchemy/model.py diff --git a/lightrag/lightrag/database/sqlalchemy/pipeline/__init__.py b/lightrag/lightrag/database/sqlalchemy/pipeline/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/lightrag/database/sqlalchemy/pipeline/create_tables.py b/lightrag/lightrag/database/sqlalchemy/pipeline/create_tables.py similarity index 100% rename from lightrag/database/sqlalchemy/pipeline/create_tables.py rename to lightrag/lightrag/database/sqlalchemy/pipeline/create_tables.py diff --git a/lightrag/database/sqlalchemy/pipeline/default_config.py b/lightrag/lightrag/database/sqlalchemy/pipeline/default_config.py similarity index 100% rename from lightrag/database/sqlalchemy/pipeline/default_config.py rename to lightrag/lightrag/database/sqlalchemy/pipeline/default_config.py diff --git a/lightrag/database/sqlalchemy/pipeline/inject_data.py b/lightrag/lightrag/database/sqlalchemy/pipeline/inject_data.py similarity index 100% rename from lightrag/database/sqlalchemy/pipeline/inject_data.py rename to lightrag/lightrag/database/sqlalchemy/pipeline/inject_data.py diff --git a/lightrag/database/sqlalchemy/sqlachemy_manager.py b/lightrag/lightrag/database/sqlalchemy/sqlachemy_manager.py similarity index 100% rename from lightrag/database/sqlalchemy/sqlachemy_manager.py rename to lightrag/lightrag/database/sqlalchemy/sqlachemy_manager.py diff --git a/lightrag/eval/__init__.py b/lightrag/lightrag/eval/__init__.py similarity index 100% rename from lightrag/eval/__init__.py rename to lightrag/lightrag/eval/__init__.py diff --git a/lightrag/eval/answer_match_acc.py b/lightrag/lightrag/eval/answer_match_acc.py similarity index 100% rename from lightrag/eval/answer_match_acc.py rename to lightrag/lightrag/eval/answer_match_acc.py diff --git a/lightrag/eval/llm_as_judge.py b/lightrag/lightrag/eval/llm_as_judge.py similarity index 97% rename from lightrag/eval/llm_as_judge.py rename to lightrag/lightrag/eval/llm_as_judge.py index 55646f66..19cd6c2b 100644 --- a/lightrag/eval/llm_as_judge.py +++ b/lightrag/lightrag/eval/llm_as_judge.py @@ -43,7 +43,7 @@ class DefaultLLMJudge(Component): __doc__ = r"""Demonstrate how to use an LLM/Generator to output True or False for a judgement query. You can use any any of your template to adapt to more tasks and sometimes you can directly ask LLM to output a score in range [0, 1] instead of only True or False. - + A call on the LLM judge equalize to _compute_single_item method. 
Args: @@ -59,7 +59,7 @@ def __init__( super().__init__() self.model_client = model_client if model_client is None: - log.info(f"model_client is None, default to OpenAIClient.") + log.info("model_client is None, default to OpenAIClient.") try: from lightrag.components.model_client import OpenAIClient except ImportError: @@ -172,8 +172,6 @@ def compute( if __name__ == "__main__": - from lightrag.utils import setup_env - from lightrag.components.model_client import OpenAIClient questions = [ "Is Beijing in China?", diff --git a/lightrag/eval/retriever_recall.py b/lightrag/lightrag/eval/retriever_recall.py similarity index 100% rename from lightrag/eval/retriever_recall.py rename to lightrag/lightrag/eval/retriever_recall.py diff --git a/lightrag/eval/retriever_relevance.py b/lightrag/lightrag/eval/retriever_relevance.py similarity index 100% rename from lightrag/eval/retriever_relevance.py rename to lightrag/lightrag/eval/retriever_relevance.py diff --git a/lightrag/icl/README.md b/lightrag/lightrag/icl/README.md similarity index 68% rename from lightrag/icl/README.md rename to lightrag/lightrag/icl/README.md index 9e8133fb..ed28807c 100644 --- a/lightrag/icl/README.md +++ b/lightrag/lightrag/icl/README.md @@ -1,3 +1,3 @@ -ICL with few-shots or many-shots if you have a large-context LLM is a must when we bootstrap any ML tasks or to compare with model finetune performances. +ICL with few-shots or many-shots if you have a large-context LLM is a must when we bootstrap any ML tasks or to compare with model finetune performances. -When ICL is used for classical ML like classification, if we have the logits of tokens, we can use `constrainded decoding` to \ No newline at end of file +When ICL is used for classical ML like classification, if we have the logits of tokens, we can use `constrainded decoding` to diff --git a/lightrag/lightrag/icl/__init__.py b/lightrag/lightrag/icl/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/lightrag/icl/retrieval_icl.py b/lightrag/lightrag/icl/retrieval_icl.py similarity index 100% rename from lightrag/icl/retrieval_icl.py rename to lightrag/lightrag/icl/retrieval_icl.py diff --git a/lightrag/optim/__init__.py b/lightrag/lightrag/optim/__init__.py similarity index 60% rename from lightrag/optim/__init__.py rename to lightrag/lightrag/optim/__init__.py index 6c97a705..76a50daf 100644 --- a/lightrag/optim/__init__.py +++ b/lightrag/lightrag/optim/__init__.py @@ -1,6 +1,6 @@ -from .few_shot_optimizer import * -from .llm_optimizer import * -from .optimizer import * +from .few_shot_optimizer import BootstrapFewShot +from .llm_optimizer import LLMOptimizer +from .optimizer import Optimizer from .sampler import RandomSampler, ClassSampler, Sampler __all__ = [ diff --git a/lightrag/optim/few_shot_optimizer.py b/lightrag/lightrag/optim/few_shot_optimizer.py similarity index 97% rename from lightrag/optim/few_shot_optimizer.py rename to lightrag/lightrag/optim/few_shot_optimizer.py index 27737ea5..101b7720 100644 --- a/lightrag/optim/few_shot_optimizer.py +++ b/lightrag/lightrag/optim/few_shot_optimizer.py @@ -84,7 +84,7 @@ def random_replace( ): assert ( len(self.current) == self.num_shots - ), f"Ensure you have called init() first to setup the current examples before replacing a subset of them." + ), "Ensure you have called init() first to setup the current examples before replacing a subset of them." 
self.proposed = self.sampler.random_replace( shots, deepcopy(self.current), weights_per_class=weights_per_class ) diff --git a/lightrag/optim/llm_augment.py b/lightrag/lightrag/optim/llm_augment.py similarity index 96% rename from lightrag/optim/llm_augment.py rename to lightrag/lightrag/optim/llm_augment.py index b56bb4f3..b18530ff 100644 --- a/lightrag/optim/llm_augment.py +++ b/lightrag/lightrag/optim/llm_augment.py @@ -10,7 +10,7 @@ from lightrag.core import Generator, GeneratorOutput from lightrag.core.component import Component from lightrag.core.base_data_class import DataClass -from lightrag.core.string_parser import YAMLParser +from lightrag.core.string_parser import YamlParser from lightrag.components.output_parsers import YAML_OUTPUT_FORMAT @@ -18,7 +18,7 @@ LLM_AUGMENTER_TEMPLATE = r"""Given inputs and outputs, you will fill in any field that is missing value. - null or '' means the field is missing. -- Understand the reasoning between inputs and outputs fields. If the 'thought/reasoning' field is null, you will fill in the reasoning +- Understand the reasoning between inputs and outputs fields. If the 'thought/reasoning' field is null, you will fill in the reasoning between the inputs and existing outputs and explain it well. - You answer will only include the missing fields along with your values - {{yaml_format_str}} @@ -64,7 +64,7 @@ def __init__( self.generator = Generator( model_client=model_client, model_kwargs=model_kwargs, - output_processors=YAMLParser(), + output_processors=YamlParser(), template=LLM_AUGMENTER_TEMPLATE, preset_prompt_kwargs={ "task_context_str": task_context_str, diff --git a/lightrag/optim/llm_optimizer.py b/lightrag/lightrag/optim/llm_optimizer.py similarity index 98% rename from lightrag/optim/llm_optimizer.py rename to lightrag/lightrag/optim/llm_optimizer.py index 857527e3..5a615043 100644 --- a/lightrag/optim/llm_optimizer.py +++ b/lightrag/lightrag/optim/llm_optimizer.py @@ -4,7 +4,7 @@ """ from typing import Dict, Any, List, Optional -from dataclasses import dataclass, field +from dataclasses import field from copy import deepcopy from lightrag.core.base_data_class import DataClass @@ -30,8 +30,8 @@ Below are some of your previous instructions and their scores, the higher the score the better the instruction: {% for instruction in instructions %} -- {{loop.index}}. -- text: {{instruction.text}} +- {{loop.index}}. 
+- text: {{instruction.text}} - score: {{instruction.score}}) {% if instruction.responses is defined %} - responses: {{instruction.responses}} diff --git a/lightrag/optim/optimizer.py b/lightrag/lightrag/optim/optimizer.py similarity index 100% rename from lightrag/optim/optimizer.py rename to lightrag/lightrag/optim/optimizer.py diff --git a/lightrag/optim/sampler.py b/lightrag/lightrag/optim/sampler.py similarity index 100% rename from lightrag/optim/sampler.py rename to lightrag/lightrag/optim/sampler.py diff --git a/lightrag/tracing/__init__.py b/lightrag/lightrag/tracing/__init__.py similarity index 100% rename from lightrag/tracing/__init__.py rename to lightrag/lightrag/tracing/__init__.py diff --git a/lightrag/tracing/decorators.py b/lightrag/lightrag/tracing/decorators.py similarity index 97% rename from lightrag/tracing/decorators.py rename to lightrag/lightrag/tracing/decorators.py index 0311ef7b..c360b3d7 100644 --- a/lightrag/tracing/decorators.py +++ b/lightrag/lightrag/tracing/decorators.py @@ -15,13 +15,13 @@ def trace_generator_states( project_name: Optional[str] = None, filename: Optional[str] = None, ): - __doc__ = r"""Decorator to trace generators in a task component. + r"""Decorator to trace generators in a task component. It dynamically attaches a GeneratorLogger to the target generator attribute and logs the prompt states of the generator. You can use it on any component that has attributes pointing to a generator object. Args: - attributes (List[str], Optional): The list of attributes that point to the generator objects. + attributes (List[str], Optional): The list of attributes that point to the generator objects. If not provided, it will automatically detect the attributes that are instances of Generator. filepath (str, Optional): The path to the directory where the trace file will be saved. Default is "./traces/". filename (str, Optional): The name of the trace file. If not provided, it will be "{class_name}_generator_trace.json". @@ -101,7 +101,7 @@ def trace_generator_call( save_dir: Optional[str] = "./traces/", error_only: bool = True, ): - __doc__ = r"""Decorator to trace generator predictions in a task component, especially failed ones. + r"""Decorator to trace generator predictions in a task component, especially failed ones. This decorator is a wrapper around the generator call method. It logs the generator call by reading its GeneratorOutput and logs the call if the output is an error. @@ -123,7 +123,7 @@ def trace_generator_call( >>> ) >>> # now you will see ./traces/TestGenerator dir being created. >>> # If the generator call has an error, it will be logged in the error file generator_call.jsonl - + If you want to decorate a component(such as LLMRetriever) from the library where you do not have access to the source code, you can do it like this: .. 
code-block:: python diff --git a/lightrag/tracing/generator_call_logger.py b/lightrag/lightrag/tracing/generator_call_logger.py similarity index 100% rename from lightrag/tracing/generator_call_logger.py rename to lightrag/lightrag/tracing/generator_call_logger.py diff --git a/lightrag/tracing/generator_state_logger.py b/lightrag/lightrag/tracing/generator_state_logger.py similarity index 100% rename from lightrag/tracing/generator_state_logger.py rename to lightrag/lightrag/tracing/generator_state_logger.py diff --git a/lightrag/utils/__init__.py b/lightrag/lightrag/utils/__init__.py similarity index 94% rename from lightrag/utils/__init__.py rename to lightrag/lightrag/utils/__init__.py index e190e2e5..64f2865c 100644 --- a/lightrag/utils/__init__.py +++ b/lightrag/lightrag/utils/__init__.py @@ -18,6 +18,7 @@ from .registry import EntityMapping from .config import new_components_from_config, new_component from .lazy_import import LazyImport, OptionalPackages, safe_import +from .setup_env import setup_env __all__ = [ @@ -42,4 +43,5 @@ "append_to_jsonl", "write_list_to_jsonl", "safe_import", + "setup_env", ] diff --git a/lightrag/utils/config.py b/lightrag/lightrag/utils/config.py similarity index 92% rename from lightrag/utils/config.py rename to lightrag/lightrag/utils/config.py index 8eec4d57..c2be00fa 100644 --- a/lightrag/utils/config.py +++ b/lightrag/lightrag/utils/config.py @@ -6,19 +6,19 @@ Example: { # attribute and its config to recreate the component "document_splitter": { - "entity_name": "DocumentSplitter", - "entity_config": { + "component_name": "DocumentSplitter", + "component_config": { "split_by": "word", "split_length": 400, "split_overlap": 200, }, }, "to_embeddings": { - "entity_name": "ToEmbeddings", - "entity_config": { + "component_name": "ToEmbeddings", + "component_config": { "embedder": { - "entity_name": "Embedder", - "entity_config": { + "component_name": "Embedder", + "component_config": { "model_client": { "entity_name": "OpenAIClient", "entity_config": {}, diff --git a/lightrag/utils/file_io.py b/lightrag/lightrag/utils/file_io.py similarity index 100% rename from lightrag/utils/file_io.py rename to lightrag/lightrag/utils/file_io.py diff --git a/lightrag/utils/lazy_import.py b/lightrag/lightrag/utils/lazy_import.py similarity index 100% rename from lightrag/utils/lazy_import.py rename to lightrag/lightrag/utils/lazy_import.py diff --git a/lightrag/utils/logger.py b/lightrag/lightrag/utils/logger.py similarity index 96% rename from lightrag/utils/logger.py rename to lightrag/lightrag/utils/logger.py index f4296a8e..ecc9882d 100644 --- a/lightrag/utils/logger.py +++ b/lightrag/lightrag/utils/logger.py @@ -20,7 +20,7 @@ import logging import sys -from typing import List, Tuple, Optional +from typing import List, Tuple, Optional, Literal import inspect import os from datetime import datetime @@ -49,7 +49,7 @@ def _get_log_config( - level: str = "INFO", + level: Literal["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] = "INFO", filepath: str = "./logs/app.log", enable_console: bool = True, enable_file: bool = True, @@ -97,7 +97,7 @@ def get_level(level: str) -> int: def enable_library_logging( - level: str = "INFO", + level: Literal["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] = "INFO", enable_console: bool = True, enable_file: bool = False, save_dir: Optional[str] = None, @@ -168,7 +168,7 @@ def enable_library_logging( def get_logger( name: str, - level: str = "INFO", + level: Literal["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] = "INFO", # filename: 
str = "./logs/app.log", save_dir: Optional[str] = None, filename: Optional[str] = None, diff --git a/lightrag/utils/registry.py b/lightrag/lightrag/utils/registry.py similarity index 96% rename from lightrag/utils/registry.py rename to lightrag/lightrag/utils/registry.py index 0b74f120..a89db713 100644 --- a/lightrag/utils/registry.py +++ b/lightrag/lightrag/utils/registry.py @@ -4,7 +4,7 @@ class EntityMapping: __doc__ = r"""A registry for entities, components,classes, function. - This can be used to configure classes, functions, or components in a registry. + This can be used to configure classes, functions, or components in a registry. """ _registry: Dict[str, Type] = {} diff --git a/lightrag/utils/serialization.py b/lightrag/lightrag/utils/serialization.py similarity index 100% rename from lightrag/utils/serialization.py rename to lightrag/lightrag/utils/serialization.py diff --git a/lightrag/lightrag/utils/setup_env.py b/lightrag/lightrag/utils/setup_env.py new file mode 100644 index 00000000..6255b060 --- /dev/null +++ b/lightrag/lightrag/utils/setup_env.py @@ -0,0 +1,5 @@ +import dotenv + + +def setup_env(): + dotenv.load_dotenv(dotenv_path=".env", override=True) diff --git a/lightrag/poetry.lock b/lightrag/poetry.lock index 36328688..9c1a98e2 100644 --- a/lightrag/poetry.lock +++ b/lightrag/poetry.lock @@ -233,67 +233,68 @@ test = ["pytest (>=6)"] [[package]] name = "faiss-cpu" -version = "1.8.0" +version = "1.8.0.post1" description = "A library for efficient similarity search and clustering of dense vectors." optional = false python-versions = ">=3.8" files = [ - {file = "faiss-cpu-1.8.0.tar.gz", hash = "sha256:3ee1549491728f37b65267c192a94661a907154a8ae0546ad50a564b8be0d82e"}, - {file = "faiss_cpu-1.8.0-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:134a064c7411acf7d1d863173a9d2605c5a59bd573639ab39a5ded5ca983b1b2"}, - {file = "faiss_cpu-1.8.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ba8e6202d561ac57394c9d691ff17f8fa6eb9a077913a993fce0a154ec0176f1"}, - {file = "faiss_cpu-1.8.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a66e9fa7b70556a39681f06e0652f4124c8ddb0a1924afe4f0e40b6924dc845b"}, - {file = "faiss_cpu-1.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:51aaef5a1255d0ea88ea7e52a2415f98c5dd2dd9cec10348d55136541eeec99f"}, - {file = "faiss_cpu-1.8.0-cp310-cp310-win_amd64.whl", hash = "sha256:38152761242870ec7019e0397cbd0ed0b0716562029ce41a71bb38448bd6d5bc"}, - {file = "faiss_cpu-1.8.0-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:c9e6ad94b86626be1a0faff3e53c4ca169eba88aa156d7e90c5a2e9ba30558fb"}, - {file = "faiss_cpu-1.8.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4601dbd81733bf1bc3bff690aac981289fb386dc8e60d0c4eec8a37ba6856d20"}, - {file = "faiss_cpu-1.8.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fa943d3b5e8c5c77cdd629d9c3c6f78d7da616e586fdd1b94aecbf2e5fa9ba06"}, - {file = "faiss_cpu-1.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b644b366c3b239b34fa3e08bf65bfc78a24eda1e1ea5b2b6d9be3e8fc73d8179"}, - {file = "faiss_cpu-1.8.0-cp311-cp311-win_amd64.whl", hash = "sha256:f85ecf3514850f93985be238351f5a70736133cfae784b372640aa17c6343a1b"}, - {file = "faiss_cpu-1.8.0-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:61abc0129a357ac00f17f5167f14dff41480de2cc852f306c3d4cd36b893ccbd"}, - {file = "faiss_cpu-1.8.0-cp312-cp312-macosx_11_0_arm64.whl", hash = 
"sha256:b788186d6eb94e6333e1aa8bb6c84b66e967458ecdd1cee22e16f04c43ee674c"}, - {file = "faiss_cpu-1.8.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5658d90a202c62e4a69c5b065785e9ddcaf6986cb395c16afed8dbe4c58c31a2"}, - {file = "faiss_cpu-1.8.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d460a372efce547e53d3c47d2c2a8a90b186ad245969048c10c1d7a1e5cf21b"}, - {file = "faiss_cpu-1.8.0-cp312-cp312-win_amd64.whl", hash = "sha256:9e6520324f0a6764dd267b3c32c76958bf2b1ec36752950f6fab31a7295980a0"}, - {file = "faiss_cpu-1.8.0-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:fc44be179d5b7f690484ef0d0caf817fea2698a5275a0c7fb6cbf406e5b2e4d1"}, - {file = "faiss_cpu-1.8.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:bbd6f0bc2e1424a12dc7e19d2cc95b53124867966b21110d26f909227e7ed1f1"}, - {file = "faiss_cpu-1.8.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:06e7add0c8a06ce8fb0443c38fcaf49c45fb74527ea633b819e56452608e64f5"}, - {file = "faiss_cpu-1.8.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b864e23c1817fa6cfe9bbec096fd7140d596002934f71aa89b196ffb1b9cd846"}, - {file = "faiss_cpu-1.8.0-cp38-cp38-win_amd64.whl", hash = "sha256:655433755845adbb6f0961e2f8980703640cb9faa96f1cd1ea190252149e0d0a"}, - {file = "faiss_cpu-1.8.0-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:e81fc376a3bcda213ffb395dda1018c953ce927c587731ad582f4e6c2b225363"}, - {file = "faiss_cpu-1.8.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:8c6fa6b7eaf558307b4ab118a236e8d1da79a8685222928e4dd52e277dba144a"}, - {file = "faiss_cpu-1.8.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:652f6812ef2e8b0f9b18209828c590bc618aca82e7f1c1b1888f52928258e406"}, - {file = "faiss_cpu-1.8.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:304da4e0d19044374b63a5b6467028572eac4bd3f32bc9e8783d800a03fb1f02"}, - {file = "faiss_cpu-1.8.0-cp39-cp39-win_amd64.whl", hash = "sha256:cb475d3f25f08c97ac64dfe026f113e2aeb9829b206b3b046256c3b40dd7eb62"}, + {file = "faiss_cpu-1.8.0.post1-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:fd84721eb599aa1da19b1b36345bb8705a60bb1d2887bbbc395a29e3d36a1a62"}, + {file = "faiss_cpu-1.8.0.post1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b78ff9079d15fd0f156bf5dd8a2975a8abffac1854a86ece263eec1500a2e836"}, + {file = "faiss_cpu-1.8.0.post1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9de25c943d1789e35fe06a20884c88cd32aedbb1a33bb8da2238cdea7bd9633f"}, + {file = "faiss_cpu-1.8.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:adae0f1b144e7216da696f14bc4991ca4300c94baaa59247c3d322588e661c95"}, + {file = "faiss_cpu-1.8.0.post1-cp310-cp310-win_amd64.whl", hash = "sha256:00345290680a444a4b4cb2d98a3844bb5c401a2160fee547c7631d759fd2ec3e"}, + {file = "faiss_cpu-1.8.0.post1-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:8d4bade10cb63e9f9ff261751edd7eb097b1f4bf30be4d0d25d6f688559d795e"}, + {file = "faiss_cpu-1.8.0.post1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:20bd43eca3b7d77e71ea56b7a558cc28e900d8abff417eb285e2d92e95d934d4"}, + {file = "faiss_cpu-1.8.0.post1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8542a87743a7f94ac656fd3e9592ad57e58b04d961ad2fe654a22a8ca59defdb"}, + {file = "faiss_cpu-1.8.0.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:ed46928de3dc20170b10fec89c54075a11383c2aaf4f119c63e0f6ae5a507d74"}, + {file = "faiss_cpu-1.8.0.post1-cp311-cp311-win_amd64.whl", hash = "sha256:4fa5fc8ea210b919aa469e27d6687e50052db906e7fec3f2257178b1384fa18b"}, + {file = "faiss_cpu-1.8.0.post1-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:96aec0d08a3099883af3a9b6356cfe736e8bd879318a940a27e9d1ae6f33d788"}, + {file = "faiss_cpu-1.8.0.post1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:92b06147fa84732ecdc965922e8ef50dc7011ef8be65821ff4abb2118cb5dce0"}, + {file = "faiss_cpu-1.8.0.post1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:709ef9394d1148aef70dbe890edbde8c282a4a2e06a8b69ab64f65e90f5ba572"}, + {file = "faiss_cpu-1.8.0.post1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:327a9c30971bf72cd8392b15eb4aff5d898c453212eae656dfaa3ba555b9ca0c"}, + {file = "faiss_cpu-1.8.0.post1-cp312-cp312-win_amd64.whl", hash = "sha256:8756f1d93faba56349883fa2f5d47fe36bb2f11f789200c6b1c691ef805485f2"}, + {file = "faiss_cpu-1.8.0.post1-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:f4a3045909c447bf1955b70083891e80f2c87c5427f20cae25245e08ec5c9e52"}, + {file = "faiss_cpu-1.8.0.post1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:8842b7fc921ca1fafdb0845f2ba029e79df04eebae72ab135239f93478a9b7a2"}, + {file = "faiss_cpu-1.8.0.post1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9d5a9799634e32c3862d5436d1e78112ed9a38f319e4523f5916e55d86adda8f"}, + {file = "faiss_cpu-1.8.0.post1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2a70923b0fbbb40f647e20bcbcbfd472277e6d84bb23ff12d2a94b6841806b55"}, + {file = "faiss_cpu-1.8.0.post1-cp38-cp38-win_amd64.whl", hash = "sha256:ce652df3c4dd50c88ac9235d072f30ce60694dc422c5f523bbbcab320e8f3097"}, + {file = "faiss_cpu-1.8.0.post1-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:83ef04b17b19189dd6601a941bdf4bfa9de0740dbcd80305aeba51a1b1955f80"}, + {file = "faiss_cpu-1.8.0.post1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c50c8697077470ede7f1939ef8dc8a846ec19cf1893b543f6b67f9af03b0a122"}, + {file = "faiss_cpu-1.8.0.post1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:98ce428a7a67fe5c64047280e5e12a8dbdecf7002f9d127b26cf1db354e9fe76"}, + {file = "faiss_cpu-1.8.0.post1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5f3b36b80380bae523e3198cfb4a137867055945ce7bf10d18fe9f0284f2fb47"}, + {file = "faiss_cpu-1.8.0.post1-cp39-cp39-win_amd64.whl", hash = "sha256:4fcc67a2353f08a20c1ab955de3cde14ef3b447761b26244a5aa849c15cbc9b3"}, + {file = "faiss_cpu-1.8.0.post1.tar.gz", hash = "sha256:5686af34414678c3d49c4fa8d774df7156e9cb48d7029071e56230e74b01cc13"}, ] [package.dependencies] -numpy = "*" +numpy = ">=1.0,<2.0" +packaging = "*" [[package]] name = "filelock" -version = "3.15.1" +version = "3.15.4" description = "A platform independent file lock." 
optional = false python-versions = ">=3.8" files = [ - {file = "filelock-3.15.1-py3-none-any.whl", hash = "sha256:71b3102950e91dfc1bb4209b64be4dc8854f40e5f534428d8684f953ac847fac"}, - {file = "filelock-3.15.1.tar.gz", hash = "sha256:58a2549afdf9e02e10720eaa4d4470f56386d7a6f72edd7d0596337af8ed7ad8"}, + {file = "filelock-3.15.4-py3-none-any.whl", hash = "sha256:6ca1fffae96225dab4c6eaf1c4f4f28cd2568d3ec2a44e15a08520504de468e7"}, + {file = "filelock-3.15.4.tar.gz", hash = "sha256:2207938cbc1844345cb01a5a95524dae30f0ce089eba5b00378295a17e3e90cb"}, ] [package.extras] docs = ["furo (>=2023.9.10)", "sphinx (>=7.2.6)", "sphinx-autodoc-typehints (>=1.25.2)"] -testing = ["covdefaults (>=2.3)", "coverage (>=7.3.2)", "diff-cover (>=8.0.1)", "pytest (>=7.4.3)", "pytest-asyncio (>=0.21)", "pytest-cov (>=4.1)", "pytest-mock (>=3.12)", "pytest-timeout (>=2.2)"] +testing = ["covdefaults (>=2.3)", "coverage (>=7.3.2)", "diff-cover (>=8.0.1)", "pytest (>=7.4.3)", "pytest-asyncio (>=0.21)", "pytest-cov (>=4.1)", "pytest-mock (>=3.12)", "pytest-timeout (>=2.2)", "virtualenv (>=20.26.2)"] typing = ["typing-extensions (>=4.8)"] [[package]] name = "fsspec" -version = "2024.6.0" +version = "2024.6.1" description = "File-system specification" optional = false python-versions = ">=3.8" files = [ - {file = "fsspec-2024.6.0-py3-none-any.whl", hash = "sha256:58d7122eb8a1a46f7f13453187bfea4972d66bf01618d37366521b1998034cee"}, - {file = "fsspec-2024.6.0.tar.gz", hash = "sha256:f579960a56e6d8038a9efc8f9c77279ec12e6299aa86b0769a7e9c46b94527c2"}, + {file = "fsspec-2024.6.1-py3-none-any.whl", hash = "sha256:3cb443f8bcd2efb31295a5b9fdb02aee81d8452c80d28f97a6d0959e6cee101e"}, + {file = "fsspec-2024.6.1.tar.gz", hash = "sha256:fad7d7e209dd4c1208e3bbfda706620e0da5142bebbd9c384afb95b07e798e49"}, ] [package.extras] @@ -672,17 +673,6 @@ files = [ intel-openmp = "==2021.*" tbb = "==2021.*" -[[package]] -name = "more-itertools" -version = "10.3.0" -description = "More routines for operating on iterables, beyond itertools" -optional = false -python-versions = ">=3.8" -files = [ - {file = "more-itertools-10.3.0.tar.gz", hash = "sha256:e5d93ef411224fbcef366a6e8ddc4c5781bc6359d43412a65dd5964e46111463"}, - {file = "more_itertools-10.3.0-py3-none-any.whl", hash = "sha256:ea6a02e24a9161e51faad17a8782b92a0df82c12c1c8886fec7f0c3fa1a1b320"}, -] - [[package]] name = "mpmath" version = "1.3.0" @@ -702,38 +692,38 @@ tests = ["pytest (>=4.6)"] [[package]] name = "mypy" -version = "1.10.0" +version = "1.10.1" description = "Optional static typing for Python" optional = false python-versions = ">=3.8" files = [ - {file = "mypy-1.10.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:da1cbf08fb3b851ab3b9523a884c232774008267b1f83371ace57f412fe308c2"}, - {file = "mypy-1.10.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:12b6bfc1b1a66095ab413160a6e520e1dc076a28f3e22f7fb25ba3b000b4ef99"}, - {file = "mypy-1.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9e36fb078cce9904c7989b9693e41cb9711e0600139ce3970c6ef814b6ebc2b2"}, - {file = "mypy-1.10.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:2b0695d605ddcd3eb2f736cd8b4e388288c21e7de85001e9f85df9187f2b50f9"}, - {file = "mypy-1.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:cd777b780312ddb135bceb9bc8722a73ec95e042f911cc279e2ec3c667076051"}, - {file = "mypy-1.10.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3be66771aa5c97602f382230165b856c231d1277c511c9a8dd058be4784472e1"}, - {file = "mypy-1.10.0-cp311-cp311-macosx_11_0_arm64.whl", hash = 
"sha256:8b2cbaca148d0754a54d44121b5825ae71868c7592a53b7292eeb0f3fdae95ee"}, - {file = "mypy-1.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1ec404a7cbe9fc0e92cb0e67f55ce0c025014e26d33e54d9e506a0f2d07fe5de"}, - {file = "mypy-1.10.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e22e1527dc3d4aa94311d246b59e47f6455b8729f4968765ac1eacf9a4760bc7"}, - {file = "mypy-1.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:a87dbfa85971e8d59c9cc1fcf534efe664d8949e4c0b6b44e8ca548e746a8d53"}, - {file = "mypy-1.10.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:a781f6ad4bab20eef8b65174a57e5203f4be627b46291f4589879bf4e257b97b"}, - {file = "mypy-1.10.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b808e12113505b97d9023b0b5e0c0705a90571c6feefc6f215c1df9381256e30"}, - {file = "mypy-1.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f55583b12156c399dce2df7d16f8a5095291354f1e839c252ec6c0611e86e2e"}, - {file = "mypy-1.10.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4cf18f9d0efa1b16478c4c129eabec36148032575391095f73cae2e722fcf9d5"}, - {file = "mypy-1.10.0-cp312-cp312-win_amd64.whl", hash = "sha256:bc6ac273b23c6b82da3bb25f4136c4fd42665f17f2cd850771cb600bdd2ebeda"}, - {file = "mypy-1.10.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9fd50226364cd2737351c79807775136b0abe084433b55b2e29181a4c3c878c0"}, - {file = "mypy-1.10.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:f90cff89eea89273727d8783fef5d4a934be2fdca11b47def50cf5d311aff727"}, - {file = "mypy-1.10.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fcfc70599efde5c67862a07a1aaf50e55bce629ace26bb19dc17cece5dd31ca4"}, - {file = "mypy-1.10.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:075cbf81f3e134eadaf247de187bd604748171d6b79736fa9b6c9685b4083061"}, - {file = "mypy-1.10.0-cp38-cp38-win_amd64.whl", hash = "sha256:3f298531bca95ff615b6e9f2fc0333aae27fa48052903a0ac90215021cdcfa4f"}, - {file = "mypy-1.10.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:fa7ef5244615a2523b56c034becde4e9e3f9b034854c93639adb667ec9ec2976"}, - {file = "mypy-1.10.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3236a4c8f535a0631f85f5fcdffba71c7feeef76a6002fcba7c1a8e57c8be1ec"}, - {file = "mypy-1.10.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4a2b5cdbb5dd35aa08ea9114436e0d79aceb2f38e32c21684dcf8e24e1e92821"}, - {file = "mypy-1.10.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:92f93b21c0fe73dc00abf91022234c79d793318b8a96faac147cd579c1671746"}, - {file = "mypy-1.10.0-cp39-cp39-win_amd64.whl", hash = "sha256:28d0e038361b45f099cc086d9dd99c15ff14d0188f44ac883010e172ce86c38a"}, - {file = "mypy-1.10.0-py3-none-any.whl", hash = "sha256:f8c083976eb530019175aabadb60921e73b4f45736760826aa1689dda8208aee"}, - {file = "mypy-1.10.0.tar.gz", hash = "sha256:3d087fcbec056c4ee34974da493a826ce316947485cef3901f511848e687c131"}, + {file = "mypy-1.10.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e36f229acfe250dc660790840916eb49726c928e8ce10fbdf90715090fe4ae02"}, + {file = "mypy-1.10.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:51a46974340baaa4145363b9e051812a2446cf583dfaeba124af966fa44593f7"}, + {file = "mypy-1.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:901c89c2d67bba57aaaca91ccdb659aa3a312de67f23b9dfb059727cce2e2e0a"}, + {file = "mypy-1.10.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:0cd62192a4a32b77ceb31272d9e74d23cd88c8060c34d1d3622db3267679a5d9"}, + {file = 
"mypy-1.10.1-cp310-cp310-win_amd64.whl", hash = "sha256:a2cbc68cb9e943ac0814c13e2452d2046c2f2b23ff0278e26599224cf164e78d"}, + {file = "mypy-1.10.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:bd6f629b67bb43dc0d9211ee98b96d8dabc97b1ad38b9b25f5e4c4d7569a0c6a"}, + {file = "mypy-1.10.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a1bbb3a6f5ff319d2b9d40b4080d46cd639abe3516d5a62c070cf0114a457d84"}, + {file = "mypy-1.10.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b8edd4e9bbbc9d7b79502eb9592cab808585516ae1bcc1446eb9122656c6066f"}, + {file = "mypy-1.10.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:6166a88b15f1759f94a46fa474c7b1b05d134b1b61fca627dd7335454cc9aa6b"}, + {file = "mypy-1.10.1-cp311-cp311-win_amd64.whl", hash = "sha256:5bb9cd11c01c8606a9d0b83ffa91d0b236a0e91bc4126d9ba9ce62906ada868e"}, + {file = "mypy-1.10.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:d8681909f7b44d0b7b86e653ca152d6dff0eb5eb41694e163c6092124f8246d7"}, + {file = "mypy-1.10.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:378c03f53f10bbdd55ca94e46ec3ba255279706a6aacaecac52ad248f98205d3"}, + {file = "mypy-1.10.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6bacf8f3a3d7d849f40ca6caea5c055122efe70e81480c8328ad29c55c69e93e"}, + {file = "mypy-1.10.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:701b5f71413f1e9855566a34d6e9d12624e9e0a8818a5704d74d6b0402e66c04"}, + {file = "mypy-1.10.1-cp312-cp312-win_amd64.whl", hash = "sha256:3c4c2992f6ea46ff7fce0072642cfb62af7a2484efe69017ed8b095f7b39ef31"}, + {file = "mypy-1.10.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:604282c886497645ffb87b8f35a57ec773a4a2721161e709a4422c1636ddde5c"}, + {file = "mypy-1.10.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:37fd87cab83f09842653f08de066ee68f1182b9b5282e4634cdb4b407266bade"}, + {file = "mypy-1.10.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8addf6313777dbb92e9564c5d32ec122bf2c6c39d683ea64de6a1fd98b90fe37"}, + {file = "mypy-1.10.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:5cc3ca0a244eb9a5249c7c583ad9a7e881aa5d7b73c35652296ddcdb33b2b9c7"}, + {file = "mypy-1.10.1-cp38-cp38-win_amd64.whl", hash = "sha256:1b3a2ffce52cc4dbaeee4df762f20a2905aa171ef157b82192f2e2f368eec05d"}, + {file = "mypy-1.10.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:fe85ed6836165d52ae8b88f99527d3d1b2362e0cb90b005409b8bed90e9059b3"}, + {file = "mypy-1.10.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c2ae450d60d7d020d67ab440c6e3fae375809988119817214440033f26ddf7bf"}, + {file = "mypy-1.10.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6be84c06e6abd72f960ba9a71561c14137a583093ffcf9bbfaf5e613d63fa531"}, + {file = "mypy-1.10.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:2189ff1e39db399f08205e22a797383613ce1cb0cb3b13d8bcf0170e45b96cc3"}, + {file = "mypy-1.10.1-cp39-cp39-win_amd64.whl", hash = "sha256:97a131ee36ac37ce9581f4220311247ab6cba896b4395b9c87af0675a13a755f"}, + {file = "mypy-1.10.1-py3-none-any.whl", hash = "sha256:71d8ac0b906354ebda8ef1673e5fde785936ac1f29ff6987c7483cfbd5a4235a"}, + {file = "mypy-1.10.1.tar.gz", hash = "sha256:1f8f492d7db9e3593ef42d4f115f04e556130f2819ad33ab84551403e97dd4c0"}, ] [package.dependencies] @@ -954,13 +944,13 @@ files = [ [[package]] name = "nvidia-nvjitlink-cu12" -version = "12.5.40" +version = "12.5.82" description = "Nvidia JIT LTO Library" optional = false python-versions = ">=3" files = [ - {file = 
"nvidia_nvjitlink_cu12-12.5.40-py3-none-manylinux2014_x86_64.whl", hash = "sha256:d9714f27c1d0f0895cd8915c07a87a1d0029a0aa36acaf9156952ec2a8a12189"}, - {file = "nvidia_nvjitlink_cu12-12.5.40-py3-none-win_amd64.whl", hash = "sha256:c3401dc8543b52d3a8158007a0c1ab4e9c768fcbd24153a48c86972102197ddd"}, + {file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f9b37bc5c8cf7509665cb6ada5aaa0ce65618f2332b7d3e78e9790511f111212"}, + {file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-win_amd64.whl", hash = "sha256:e782564d705ff0bf61ac3e1bf730166da66dd2fe9012f111ede5fc49b64ae697"}, ] [[package]] @@ -976,13 +966,13 @@ files = [ [[package]] name = "openai" -version = "1.34.0" +version = "1.35.7" description = "The official Python library for the openai API" optional = false python-versions = ">=3.7.1" files = [ - {file = "openai-1.34.0-py3-none-any.whl", hash = "sha256:018623c2f795424044675c6230fa3bfbf98d9e0aab45d8fd116f2efb2cfb6b7e"}, - {file = "openai-1.34.0.tar.gz", hash = "sha256:95c8e2da4acd6958e626186957d656597613587195abd0fb2527566a93e76770"}, + {file = "openai-1.35.7-py3-none-any.whl", hash = "sha256:3d1e0b0aac9b0db69a972d36dc7efa7563f8e8d65550b27a48f2a0c2ec207e80"}, + {file = "openai-1.35.7.tar.gz", hash = "sha256:009bfa1504c9c7ef64d87be55936d142325656bbc6d98c68b669d6472e4beb09"}, ] [package.dependencies] @@ -1059,109 +1049,121 @@ virtualenv = ">=20.10.0" [[package]] name = "pydantic" -version = "2.7.4" +version = "2.8.0" description = "Data validation using Python type hints" optional = false python-versions = ">=3.8" files = [ - {file = "pydantic-2.7.4-py3-none-any.whl", hash = "sha256:ee8538d41ccb9c0a9ad3e0e5f07bf15ed8015b481ced539a1759d8cc89ae90d0"}, - {file = "pydantic-2.7.4.tar.gz", hash = "sha256:0c84efd9548d545f63ac0060c1e4d39bb9b14db8b3c0652338aecc07b5adec52"}, + {file = "pydantic-2.8.0-py3-none-any.whl", hash = "sha256:ead4f3a1e92386a734ca1411cb25d94147cf8778ed5be6b56749047676d6364e"}, + {file = "pydantic-2.8.0.tar.gz", hash = "sha256:d970ffb9d030b710795878940bd0489842c638e7252fc4a19c3ae2f7da4d6141"}, ] [package.dependencies] annotated-types = ">=0.4.0" -pydantic-core = "2.18.4" -typing-extensions = ">=4.6.1" +pydantic-core = "2.20.0" +typing-extensions = [ + {version = ">=4.12.2", markers = "python_version >= \"3.13\""}, + {version = ">=4.6.1", markers = "python_version < \"3.13\""}, +] [package.extras] email = ["email-validator (>=2.0.0)"] [[package]] name = "pydantic-core" -version = "2.18.4" +version = "2.20.0" description = "Core functionality for Pydantic validation and serialization" optional = false python-versions = ">=3.8" files = [ - {file = "pydantic_core-2.18.4-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:f76d0ad001edd426b92233d45c746fd08f467d56100fd8f30e9ace4b005266e4"}, - {file = "pydantic_core-2.18.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:59ff3e89f4eaf14050c8022011862df275b552caef8082e37b542b066ce1ff26"}, - {file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a55b5b16c839df1070bc113c1f7f94a0af4433fcfa1b41799ce7606e5c79ce0a"}, - {file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4d0dcc59664fcb8974b356fe0a18a672d6d7cf9f54746c05f43275fc48636851"}, - {file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8951eee36c57cd128f779e641e21eb40bc5073eb28b2d23f33eb0ef14ffb3f5d"}, - {file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", 
hash = "sha256:4701b19f7e3a06ea655513f7938de6f108123bf7c86bbebb1196eb9bd35cf724"}, - {file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e00a3f196329e08e43d99b79b286d60ce46bed10f2280d25a1718399457e06be"}, - {file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:97736815b9cc893b2b7f663628e63f436018b75f44854c8027040e05230eeddb"}, - {file = "pydantic_core-2.18.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:6891a2ae0e8692679c07728819b6e2b822fb30ca7445f67bbf6509b25a96332c"}, - {file = "pydantic_core-2.18.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:bc4ff9805858bd54d1a20efff925ccd89c9d2e7cf4986144b30802bf78091c3e"}, - {file = "pydantic_core-2.18.4-cp310-none-win32.whl", hash = "sha256:1b4de2e51bbcb61fdebd0ab86ef28062704f62c82bbf4addc4e37fa4b00b7cbc"}, - {file = "pydantic_core-2.18.4-cp310-none-win_amd64.whl", hash = "sha256:6a750aec7bf431517a9fd78cb93c97b9b0c496090fee84a47a0d23668976b4b0"}, - {file = "pydantic_core-2.18.4-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:942ba11e7dfb66dc70f9ae66b33452f51ac7bb90676da39a7345e99ffb55402d"}, - {file = "pydantic_core-2.18.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b2ebef0e0b4454320274f5e83a41844c63438fdc874ea40a8b5b4ecb7693f1c4"}, - {file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a642295cd0c8df1b86fc3dced1d067874c353a188dc8e0f744626d49e9aa51c4"}, - {file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5f09baa656c904807e832cf9cce799c6460c450c4ad80803517032da0cd062e2"}, - {file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:98906207f29bc2c459ff64fa007afd10a8c8ac080f7e4d5beff4c97086a3dabd"}, - {file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:19894b95aacfa98e7cb093cd7881a0c76f55731efad31073db4521e2b6ff5b7d"}, - {file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0fbbdc827fe5e42e4d196c746b890b3d72876bdbf160b0eafe9f0334525119c8"}, - {file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f85d05aa0918283cf29a30b547b4df2fbb56b45b135f9e35b6807cb28bc47951"}, - {file = "pydantic_core-2.18.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e85637bc8fe81ddb73fda9e56bab24560bdddfa98aa64f87aaa4e4b6730c23d2"}, - {file = "pydantic_core-2.18.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:2f5966897e5461f818e136b8451d0551a2e77259eb0f73a837027b47dc95dab9"}, - {file = "pydantic_core-2.18.4-cp311-none-win32.whl", hash = "sha256:44c7486a4228413c317952e9d89598bcdfb06399735e49e0f8df643e1ccd0558"}, - {file = "pydantic_core-2.18.4-cp311-none-win_amd64.whl", hash = "sha256:8a7164fe2005d03c64fd3b85649891cd4953a8de53107940bf272500ba8a788b"}, - {file = "pydantic_core-2.18.4-cp311-none-win_arm64.whl", hash = "sha256:4e99bc050fe65c450344421017f98298a97cefc18c53bb2f7b3531eb39bc7805"}, - {file = "pydantic_core-2.18.4-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:6f5c4d41b2771c730ea1c34e458e781b18cc668d194958e0112455fff4e402b2"}, - {file = "pydantic_core-2.18.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2fdf2156aa3d017fddf8aea5adfba9f777db1d6022d392b682d2a8329e087cef"}, - {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:4748321b5078216070b151d5271ef3e7cc905ab170bbfd27d5c83ee3ec436695"}, - {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:847a35c4d58721c5dc3dba599878ebbdfd96784f3fb8bb2c356e123bdcd73f34"}, - {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3c40d4eaad41f78e3bbda31b89edc46a3f3dc6e171bf0ecf097ff7a0ffff7cb1"}, - {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:21a5e440dbe315ab9825fcd459b8814bb92b27c974cbc23c3e8baa2b76890077"}, - {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:01dd777215e2aa86dfd664daed5957704b769e726626393438f9c87690ce78c3"}, - {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:4b06beb3b3f1479d32befd1f3079cc47b34fa2da62457cdf6c963393340b56e9"}, - {file = "pydantic_core-2.18.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:564d7922e4b13a16b98772441879fcdcbe82ff50daa622d681dd682175ea918c"}, - {file = "pydantic_core-2.18.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:0eb2a4f660fcd8e2b1c90ad566db2b98d7f3f4717c64fe0a83e0adb39766d5b8"}, - {file = "pydantic_core-2.18.4-cp312-none-win32.whl", hash = "sha256:8b8bab4c97248095ae0c4455b5a1cd1cdd96e4e4769306ab19dda135ea4cdb07"}, - {file = "pydantic_core-2.18.4-cp312-none-win_amd64.whl", hash = "sha256:14601cdb733d741b8958224030e2bfe21a4a881fb3dd6fbb21f071cabd48fa0a"}, - {file = "pydantic_core-2.18.4-cp312-none-win_arm64.whl", hash = "sha256:c1322d7dd74713dcc157a2b7898a564ab091ca6c58302d5c7b4c07296e3fd00f"}, - {file = "pydantic_core-2.18.4-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:823be1deb01793da05ecb0484d6c9e20baebb39bd42b5d72636ae9cf8350dbd2"}, - {file = "pydantic_core-2.18.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ebef0dd9bf9b812bf75bda96743f2a6c5734a02092ae7f721c048d156d5fabae"}, - {file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ae1d6df168efb88d7d522664693607b80b4080be6750c913eefb77e34c12c71a"}, - {file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f9899c94762343f2cc2fc64c13e7cae4c3cc65cdfc87dd810a31654c9b7358cc"}, - {file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:99457f184ad90235cfe8461c4d70ab7dd2680e28821c29eca00252ba90308c78"}, - {file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:18f469a3d2a2fdafe99296a87e8a4c37748b5080a26b806a707f25a902c040a8"}, - {file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b7cdf28938ac6b8b49ae5e92f2735056a7ba99c9b110a474473fd71185c1af5d"}, - {file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:938cb21650855054dc54dfd9120a851c974f95450f00683399006aa6e8abb057"}, - {file = "pydantic_core-2.18.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:44cd83ab6a51da80fb5adbd9560e26018e2ac7826f9626bc06ca3dc074cd198b"}, - {file = "pydantic_core-2.18.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:972658f4a72d02b8abfa2581d92d59f59897d2e9f7e708fdabe922f9087773af"}, - {file = "pydantic_core-2.18.4-cp38-none-win32.whl", hash = "sha256:1d886dc848e60cb7666f771e406acae54ab279b9f1e4143babc9c2258213daa2"}, - {file = "pydantic_core-2.18.4-cp38-none-win_amd64.whl", hash = 
"sha256:bb4462bd43c2460774914b8525f79b00f8f407c945d50881568f294c1d9b4443"}, - {file = "pydantic_core-2.18.4-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:44a688331d4a4e2129140a8118479443bd6f1905231138971372fcde37e43528"}, - {file = "pydantic_core-2.18.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a2fdd81edd64342c85ac7cf2753ccae0b79bf2dfa063785503cb85a7d3593223"}, - {file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:86110d7e1907ab36691f80b33eb2da87d780f4739ae773e5fc83fb272f88825f"}, - {file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:46387e38bd641b3ee5ce247563b60c5ca098da9c56c75c157a05eaa0933ed154"}, - {file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:123c3cec203e3f5ac7b000bd82235f1a3eced8665b63d18be751f115588fea30"}, - {file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dc1803ac5c32ec324c5261c7209e8f8ce88e83254c4e1aebdc8b0a39f9ddb443"}, - {file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:53db086f9f6ab2b4061958d9c276d1dbe3690e8dd727d6abf2321d6cce37fa94"}, - {file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:abc267fa9837245cc28ea6929f19fa335f3dc330a35d2e45509b6566dc18be23"}, - {file = "pydantic_core-2.18.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:a0d829524aaefdebccb869eed855e2d04c21d2d7479b6cada7ace5448416597b"}, - {file = "pydantic_core-2.18.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:509daade3b8649f80d4e5ff21aa5673e4ebe58590b25fe42fac5f0f52c6f034a"}, - {file = "pydantic_core-2.18.4-cp39-none-win32.whl", hash = "sha256:ca26a1e73c48cfc54c4a76ff78df3727b9d9f4ccc8dbee4ae3f73306a591676d"}, - {file = "pydantic_core-2.18.4-cp39-none-win_amd64.whl", hash = "sha256:c67598100338d5d985db1b3d21f3619ef392e185e71b8d52bceacc4a7771ea7e"}, - {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:574d92eac874f7f4db0ca653514d823a0d22e2354359d0759e3f6a406db5d55d"}, - {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:1f4d26ceb5eb9eed4af91bebeae4b06c3fb28966ca3a8fb765208cf6b51102ab"}, - {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77450e6d20016ec41f43ca4a6c63e9fdde03f0ae3fe90e7c27bdbeaece8b1ed4"}, - {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d323a01da91851a4f17bf592faf46149c9169d68430b3146dcba2bb5e5719abc"}, - {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:43d447dd2ae072a0065389092a231283f62d960030ecd27565672bd40746c507"}, - {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:578e24f761f3b425834f297b9935e1ce2e30f51400964ce4801002435a1b41ef"}, - {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:81b5efb2f126454586d0f40c4d834010979cb80785173d1586df845a632e4e6d"}, - {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:ab86ce7c8f9bea87b9d12c7f0af71102acbf5ecbc66c17796cff45dae54ef9a5"}, - {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:90afc12421df2b1b4dcc975f814e21bc1754640d502a2fbcc6d41e77af5ec312"}, - {file = 
"pydantic_core-2.18.4-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:51991a89639a912c17bef4b45c87bd83593aee0437d8102556af4885811d59f5"}, - {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:293afe532740370aba8c060882f7d26cfd00c94cae32fd2e212a3a6e3b7bc15e"}, - {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b48ece5bde2e768197a2d0f6e925f9d7e3e826f0ad2271120f8144a9db18d5c8"}, - {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:eae237477a873ab46e8dd748e515c72c0c804fb380fbe6c85533c7de51f23a8f"}, - {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:834b5230b5dfc0c1ec37b2fda433b271cbbc0e507560b5d1588e2cc1148cf1ce"}, - {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:e858ac0a25074ba4bce653f9b5d0a85b7456eaddadc0ce82d3878c22489fa4ee"}, - {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:2fd41f6eff4c20778d717af1cc50eca52f5afe7805ee530a4fbd0bae284f16e9"}, - {file = "pydantic_core-2.18.4.tar.gz", hash = "sha256:ec3beeada09ff865c344ff3bc2f427f5e6c26401cc6113d77e372c3fdac73864"}, + {file = "pydantic_core-2.20.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:e9dcd7fb34f7bfb239b5fa420033642fff0ad676b765559c3737b91f664d4fa9"}, + {file = "pydantic_core-2.20.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:649a764d9b0da29816889424697b2a3746963ad36d3e0968784ceed6e40c6355"}, + {file = "pydantic_core-2.20.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7701df088d0b05f3460f7ba15aec81ac8b0fb5690367dfd072a6c38cf5b7fdb5"}, + {file = "pydantic_core-2.20.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ab760f17c3e792225cdaef31ca23c0aea45c14ce80d8eff62503f86a5ab76bff"}, + {file = "pydantic_core-2.20.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cb1ad5b4d73cde784cf64580166568074f5ccd2548d765e690546cff3d80937d"}, + {file = "pydantic_core-2.20.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b81ec2efc04fc1dbf400647d4357d64fb25543bae38d2d19787d69360aad21c9"}, + {file = "pydantic_core-2.20.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c4a9732a5cad764ba37f3aa873dccb41b584f69c347a57323eda0930deec8e10"}, + {file = "pydantic_core-2.20.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6dc85b9e10cc21d9c1055f15684f76fa4facadddcb6cd63abab702eb93c98943"}, + {file = "pydantic_core-2.20.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:21d9f7e24f63fdc7118e6cc49defaab8c1d27570782f7e5256169d77498cf7c7"}, + {file = "pydantic_core-2.20.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8b315685832ab9287e6124b5d74fc12dda31e6421d7f6b08525791452844bc2d"}, + {file = "pydantic_core-2.20.0-cp310-none-win32.whl", hash = "sha256:c3dc8ec8b87c7ad534c75b8855168a08a7036fdb9deeeed5705ba9410721c84d"}, + {file = "pydantic_core-2.20.0-cp310-none-win_amd64.whl", hash = "sha256:85770b4b37bb36ef93a6122601795231225641003e0318d23c6233c59b424279"}, + {file = "pydantic_core-2.20.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:58e251bb5a5998f7226dc90b0b753eeffa720bd66664eba51927c2a7a2d5f32c"}, + {file = "pydantic_core-2.20.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:78d584caac52c24240ef9ecd75de64c760bbd0e20dbf6973631815e3ef16ef8b"}, + {file = 
"pydantic_core-2.20.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5084ec9721f82bef5ff7c4d1ee65e1626783abb585f8c0993833490b63fe1792"}, + {file = "pydantic_core-2.20.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:6d0f52684868db7c218437d260e14d37948b094493f2646f22d3dda7229bbe3f"}, + {file = "pydantic_core-2.20.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1def125d59a87fe451212a72ab9ed34c118ff771e5473fef4f2f95d8ede26d75"}, + {file = "pydantic_core-2.20.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b34480fd6778ab356abf1e9086a4ced95002a1e195e8d2fd182b0def9d944d11"}, + {file = "pydantic_core-2.20.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d42669d319db366cb567c3b444f43caa7ffb779bf9530692c6f244fc635a41eb"}, + {file = "pydantic_core-2.20.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:53b06aea7a48919a254b32107647be9128c066aaa6ee6d5d08222325f25ef175"}, + {file = "pydantic_core-2.20.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:1f038156b696a1c39d763b2080aeefa87ddb4162c10aa9fabfefffc3dd8180fa"}, + {file = "pydantic_core-2.20.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:3f0f3a4a23717280a5ee3ac4fb1f81d6fde604c9ec5100f7f6f987716bb8c137"}, + {file = "pydantic_core-2.20.0-cp311-none-win32.whl", hash = "sha256:316fe7c3fec017affd916a0c83d6f1ec697cbbbdf1124769fa73328e7907cc2e"}, + {file = "pydantic_core-2.20.0-cp311-none-win_amd64.whl", hash = "sha256:2d06a7fa437f93782e3f32d739c3ec189f82fca74336c08255f9e20cea1ed378"}, + {file = "pydantic_core-2.20.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:d6f8c49657f3eb7720ed4c9b26624063da14937fc94d1812f1e04a2204db3e17"}, + {file = "pydantic_core-2.20.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ad1bd2f377f56fec11d5cfd0977c30061cd19f4fa199bf138b200ec0d5e27eeb"}, + {file = "pydantic_core-2.20.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ed741183719a5271f97d93bbcc45ed64619fa38068aaa6e90027d1d17e30dc8d"}, + {file = "pydantic_core-2.20.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d82e5ed3a05f2dcb89c6ead2fd0dbff7ac09bc02c1b4028ece2d3a3854d049ce"}, + {file = "pydantic_core-2.20.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b2ba34a099576234671f2e4274e5bc6813b22e28778c216d680eabd0db3f7dad"}, + {file = "pydantic_core-2.20.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:879ae6bb08a063b3e1b7ac8c860096d8fd6b48dd9b2690b7f2738b8c835e744b"}, + {file = "pydantic_core-2.20.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b0eefc7633a04c0694340aad91fbfd1986fe1a1e0c63a22793ba40a18fcbdc8"}, + {file = "pydantic_core-2.20.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:73deadd6fd8a23e2f40b412b3ac617a112143c8989a4fe265050fd91ba5c0608"}, + {file = "pydantic_core-2.20.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:35681445dc85446fb105943d81ae7569aa7e89de80d1ca4ac3229e05c311bdb1"}, + {file = "pydantic_core-2.20.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:0f6dd3612a3b9f91f2e63924ea18a4476656c6d01843ca20a4c09e00422195af"}, + {file = "pydantic_core-2.20.0-cp312-none-win32.whl", hash = "sha256:7e37b6bb6e90c2b8412b06373c6978d9d81e7199a40e24a6ef480e8acdeaf918"}, + {file = "pydantic_core-2.20.0-cp312-none-win_amd64.whl", hash = 
"sha256:7d4df13d1c55e84351fab51383520b84f490740a9f1fec905362aa64590b7a5d"}, + {file = "pydantic_core-2.20.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:d43e7ab3b65e4dc35a7612cfff7b0fd62dce5bc11a7cd198310b57f39847fd6c"}, + {file = "pydantic_core-2.20.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7b6a24d7b5893392f2b8e3b7a0031ae3b14c6c1942a4615f0d8794fdeeefb08b"}, + {file = "pydantic_core-2.20.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b2f13c3e955a087c3ec86f97661d9f72a76e221281b2262956af381224cfc243"}, + {file = "pydantic_core-2.20.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:72432fd6e868c8d0a6849869e004b8bcae233a3c56383954c228316694920b38"}, + {file = "pydantic_core-2.20.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d70a8ff2d4953afb4cbe6211f17268ad29c0b47e73d3372f40e7775904bc28fc"}, + {file = "pydantic_core-2.20.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8e49524917b8d3c2f42cd0d2df61178e08e50f5f029f9af1f402b3ee64574392"}, + {file = "pydantic_core-2.20.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a4f0f71653b1c1bad0350bc0b4cc057ab87b438ff18fa6392533811ebd01439c"}, + {file = "pydantic_core-2.20.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:16197e6f4fdecb9892ed2436e507e44f0a1aa2cff3b9306d1c879ea2f9200997"}, + {file = "pydantic_core-2.20.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:763602504bf640b3ded3bba3f8ed8a1cc2fc6a87b8d55c1c5689f428c49c947e"}, + {file = "pydantic_core-2.20.0-cp313-none-win32.whl", hash = "sha256:a3f243f318bd9523277fa123b3163f4c005a3e8619d4b867064de02f287a564d"}, + {file = "pydantic_core-2.20.0-cp313-none-win_amd64.whl", hash = "sha256:03aceaf6a5adaad3bec2233edc5a7905026553916615888e53154807e404545c"}, + {file = "pydantic_core-2.20.0-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:d6f2d8b8da1f03f577243b07bbdd3412eee3d37d1f2fd71d1513cbc76a8c1239"}, + {file = "pydantic_core-2.20.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:a272785a226869416c6b3c1b7e450506152d3844207331f02f27173562c917e0"}, + {file = "pydantic_core-2.20.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:efbb412d55a4ffe73963fed95c09ccb83647ec63b711c4b3752be10a56f0090b"}, + {file = "pydantic_core-2.20.0-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1e4f46189d8740561b43655263a41aac75ff0388febcb2c9ec4f1b60a0ec12f3"}, + {file = "pydantic_core-2.20.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:87d3df115f4a3c8c5e4d5acf067d399c6466d7e604fc9ee9acbe6f0c88a0c3cf"}, + {file = "pydantic_core-2.20.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a340d2bdebe819d08f605e9705ed551c3feb97e4fd71822d7147c1e4bdbb9508"}, + {file = "pydantic_core-2.20.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:616b9c2f882393d422ba11b40e72382fe975e806ad693095e9a3b67c59ea6150"}, + {file = "pydantic_core-2.20.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:25c46bb2ff6084859bbcfdf4f1a63004b98e88b6d04053e8bf324e115398e9e7"}, + {file = "pydantic_core-2.20.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:23425eccef8f2c342f78d3a238c824623836c6c874d93c726673dbf7e56c78c0"}, + {file = "pydantic_core-2.20.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:52527e8f223ba29608d999d65b204676398009725007c9336651c2ec2d93cffc"}, + {file = 
"pydantic_core-2.20.0-cp38-none-win32.whl", hash = "sha256:1c3c5b7f70dd19a6845292b0775295ea81c61540f68671ae06bfe4421b3222c2"}, + {file = "pydantic_core-2.20.0-cp38-none-win_amd64.whl", hash = "sha256:8093473d7b9e908af1cef30025609afc8f5fd2a16ff07f97440fd911421e4432"}, + {file = "pydantic_core-2.20.0-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:ee7785938e407418795e4399b2bf5b5f3cf6cf728077a7f26973220d58d885cf"}, + {file = "pydantic_core-2.20.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:0e75794883d635071cf6b4ed2a5d7a1e50672ab7a051454c76446ef1ebcdcc91"}, + {file = "pydantic_core-2.20.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:344e352c96e53b4f56b53d24728217c69399b8129c16789f70236083c6ceb2ac"}, + {file = "pydantic_core-2.20.0-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:978d4123ad1e605daf1ba5e01d4f235bcf7b6e340ef07e7122e8e9cfe3eb61ab"}, + {file = "pydantic_core-2.20.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3c05eaf6c863781eb834ab41f5963604ab92855822a2062897958089d1335dad"}, + {file = "pydantic_core-2.20.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bc7e43b4a528ffca8c9151b6a2ca34482c2fdc05e6aa24a84b7f475c896fc51d"}, + {file = "pydantic_core-2.20.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:658287a29351166510ebbe0a75c373600cc4367a3d9337b964dada8d38bcc0f4"}, + {file = "pydantic_core-2.20.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1dacf660d6de692fe351e8c806e7efccf09ee5184865893afbe8e59be4920b4a"}, + {file = "pydantic_core-2.20.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:3e147fc6e27b9a487320d78515c5f29798b539179f7777018cedf51b7749e4f4"}, + {file = "pydantic_core-2.20.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:c867230d715a3dd1d962c8d9bef0d3168994ed663e21bf748b6e3a529a129aab"}, + {file = "pydantic_core-2.20.0-cp39-none-win32.whl", hash = "sha256:22b813baf0dbf612752d8143a2dbf8e33ccb850656b7850e009bad2e101fc377"}, + {file = "pydantic_core-2.20.0-cp39-none-win_amd64.whl", hash = "sha256:3a7235b46c1bbe201f09b6f0f5e6c36b16bad3d0532a10493742f91fbdc8035f"}, + {file = "pydantic_core-2.20.0-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:cafde15a6f7feaec2f570646e2ffc5b73412295d29134a29067e70740ec6ee20"}, + {file = "pydantic_core-2.20.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:2aec8eeea0b08fd6bc2213d8e86811a07491849fd3d79955b62d83e32fa2ad5f"}, + {file = "pydantic_core-2.20.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:840200827984f1c4e114008abc2f5ede362d6e11ed0b5931681884dd41852ff1"}, + {file = "pydantic_core-2.20.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f8ea1d8b7df522e5ced34993c423c3bf3735c53df8b2a15688a2f03a7d678800"}, + {file = "pydantic_core-2.20.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d5b8376a867047bf08910573deb95d3c8dfb976eb014ee24f3b5a61ccc5bee1b"}, + {file = "pydantic_core-2.20.0-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:d08264b4460326cefacc179fc1411304d5af388a79910832835e6f641512358b"}, + {file = "pydantic_core-2.20.0-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:7a3639011c2e8a9628466f616ed7fb413f30032b891898e10895a0a8b5857d6c"}, + {file = "pydantic_core-2.20.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:05e83ce2f7eba29e627dd8066aa6c4c0269b2d4f889c0eba157233a353053cea"}, + {file = 
"pydantic_core-2.20.0-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:603a843fea76a595c8f661cd4da4d2281dff1e38c4a836a928eac1a2f8fe88e4"}, + {file = "pydantic_core-2.20.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:ac76f30d5d3454f4c28826d891fe74d25121a346c69523c9810ebba43f3b1cec"}, + {file = "pydantic_core-2.20.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:22e3b1d4b1b3f6082849f9b28427ef147a5b46a6132a3dbaf9ca1baa40c88609"}, + {file = "pydantic_core-2.20.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2761f71faed820e25ec62eacba670d1b5c2709bb131a19fcdbfbb09884593e5a"}, + {file = "pydantic_core-2.20.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a0586cddbf4380e24569b8a05f234e7305717cc8323f50114dfb2051fcbce2a3"}, + {file = "pydantic_core-2.20.0-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:b8c46a8cf53e849eea7090f331ae2202cd0f1ceb090b00f5902c423bd1e11805"}, + {file = "pydantic_core-2.20.0-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:b4a085bd04af7245e140d1b95619fe8abb445a3d7fdf219b3f80c940853268ef"}, + {file = "pydantic_core-2.20.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:116b326ac82c8b315e7348390f6d30bcfe6e688a7d3f1de50ff7bcc2042a23c2"}, + {file = "pydantic_core-2.20.0.tar.gz", hash = "sha256:366be8e64e0cb63d87cf79b4e1765c0703dd6313c729b22e7b9e378db6b96877"}, ] [package.dependencies] @@ -1524,64 +1526,64 @@ files = [ [[package]] name = "sqlalchemy" -version = "2.0.30" +version = "2.0.31" description = "Database Abstraction Library" optional = false python-versions = ">=3.7" files = [ - {file = "SQLAlchemy-2.0.30-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:3b48154678e76445c7ded1896715ce05319f74b1e73cf82d4f8b59b46e9c0ddc"}, - {file = "SQLAlchemy-2.0.30-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2753743c2afd061bb95a61a51bbb6a1a11ac1c44292fad898f10c9839a7f75b2"}, - {file = "SQLAlchemy-2.0.30-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a7bfc726d167f425d4c16269a9a10fe8630ff6d14b683d588044dcef2d0f6be7"}, - {file = "SQLAlchemy-2.0.30-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c4f61ada6979223013d9ab83a3ed003ded6959eae37d0d685db2c147e9143797"}, - {file = "SQLAlchemy-2.0.30-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:3a365eda439b7a00732638f11072907c1bc8e351c7665e7e5da91b169af794af"}, - {file = "SQLAlchemy-2.0.30-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:bba002a9447b291548e8d66fd8c96a6a7ed4f2def0bb155f4f0a1309fd2735d5"}, - {file = "SQLAlchemy-2.0.30-cp310-cp310-win32.whl", hash = "sha256:0138c5c16be3600923fa2169532205d18891b28afa817cb49b50e08f62198bb8"}, - {file = "SQLAlchemy-2.0.30-cp310-cp310-win_amd64.whl", hash = "sha256:99650e9f4cf3ad0d409fed3eec4f071fadd032e9a5edc7270cd646a26446feeb"}, - {file = "SQLAlchemy-2.0.30-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:955991a09f0992c68a499791a753523f50f71a6885531568404fa0f231832aa0"}, - {file = "SQLAlchemy-2.0.30-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f69e4c756ee2686767eb80f94c0125c8b0a0b87ede03eacc5c8ae3b54b99dc46"}, - {file = "SQLAlchemy-2.0.30-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69c9db1ce00e59e8dd09d7bae852a9add716efdc070a3e2068377e6ff0d6fdaa"}, - {file = "SQLAlchemy-2.0.30-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a1429a4b0f709f19ff3b0cf13675b2b9bfa8a7e79990003207a011c0db880a13"}, - {file = 
"SQLAlchemy-2.0.30-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:efedba7e13aa9a6c8407c48facfdfa108a5a4128e35f4c68f20c3407e4376aa9"}, - {file = "SQLAlchemy-2.0.30-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:16863e2b132b761891d6c49f0a0f70030e0bcac4fd208117f6b7e053e68668d0"}, - {file = "SQLAlchemy-2.0.30-cp311-cp311-win32.whl", hash = "sha256:2ecabd9ccaa6e914e3dbb2aa46b76dede7eadc8cbf1b8083c94d936bcd5ffb49"}, - {file = "SQLAlchemy-2.0.30-cp311-cp311-win_amd64.whl", hash = "sha256:0b3f4c438e37d22b83e640f825ef0f37b95db9aa2d68203f2c9549375d0b2260"}, - {file = "SQLAlchemy-2.0.30-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:5a79d65395ac5e6b0c2890935bad892eabb911c4aa8e8015067ddb37eea3d56c"}, - {file = "SQLAlchemy-2.0.30-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:9a5baf9267b752390252889f0c802ea13b52dfee5e369527da229189b8bd592e"}, - {file = "SQLAlchemy-2.0.30-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3cb5a646930c5123f8461f6468901573f334c2c63c795b9af350063a736d0134"}, - {file = "SQLAlchemy-2.0.30-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:296230899df0b77dec4eb799bcea6fbe39a43707ce7bb166519c97b583cfcab3"}, - {file = "SQLAlchemy-2.0.30-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:c62d401223f468eb4da32627bffc0c78ed516b03bb8a34a58be54d618b74d472"}, - {file = "SQLAlchemy-2.0.30-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:3b69e934f0f2b677ec111b4d83f92dc1a3210a779f69bf905273192cf4ed433e"}, - {file = "SQLAlchemy-2.0.30-cp312-cp312-win32.whl", hash = "sha256:77d2edb1f54aff37e3318f611637171e8ec71472f1fdc7348b41dcb226f93d90"}, - {file = "SQLAlchemy-2.0.30-cp312-cp312-win_amd64.whl", hash = "sha256:b6c7ec2b1f4969fc19b65b7059ed00497e25f54069407a8701091beb69e591a5"}, - {file = "SQLAlchemy-2.0.30-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:5a8e3b0a7e09e94be7510d1661339d6b52daf202ed2f5b1f9f48ea34ee6f2d57"}, - {file = "SQLAlchemy-2.0.30-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b60203c63e8f984df92035610c5fb76d941254cf5d19751faab7d33b21e5ddc0"}, - {file = "SQLAlchemy-2.0.30-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1dc3eabd8c0232ee8387fbe03e0a62220a6f089e278b1f0aaf5e2d6210741ad"}, - {file = "SQLAlchemy-2.0.30-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:40ad017c672c00b9b663fcfcd5f0864a0a97828e2ee7ab0c140dc84058d194cf"}, - {file = "SQLAlchemy-2.0.30-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:e42203d8d20dc704604862977b1470a122e4892791fe3ed165f041e4bf447a1b"}, - {file = "SQLAlchemy-2.0.30-cp37-cp37m-win32.whl", hash = "sha256:2a4f4da89c74435f2bc61878cd08f3646b699e7d2eba97144030d1be44e27584"}, - {file = "SQLAlchemy-2.0.30-cp37-cp37m-win_amd64.whl", hash = "sha256:b6bf767d14b77f6a18b6982cbbf29d71bede087edae495d11ab358280f304d8e"}, - {file = "SQLAlchemy-2.0.30-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:bc0c53579650a891f9b83fa3cecd4e00218e071d0ba00c4890f5be0c34887ed3"}, - {file = "SQLAlchemy-2.0.30-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:311710f9a2ee235f1403537b10c7687214bb1f2b9ebb52702c5aa4a77f0b3af7"}, - {file = "SQLAlchemy-2.0.30-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:408f8b0e2c04677e9c93f40eef3ab22f550fecb3011b187f66a096395ff3d9fd"}, - {file = "SQLAlchemy-2.0.30-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:37a4b4fb0dd4d2669070fb05b8b8824afd0af57587393015baee1cf9890242d9"}, - {file = 
"SQLAlchemy-2.0.30-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:a943d297126c9230719c27fcbbeab57ecd5d15b0bd6bfd26e91bfcfe64220621"}, - {file = "SQLAlchemy-2.0.30-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:0a089e218654e740a41388893e090d2e2c22c29028c9d1353feb38638820bbeb"}, - {file = "SQLAlchemy-2.0.30-cp38-cp38-win32.whl", hash = "sha256:fa561138a64f949f3e889eb9ab8c58e1504ab351d6cf55259dc4c248eaa19da6"}, - {file = "SQLAlchemy-2.0.30-cp38-cp38-win_amd64.whl", hash = "sha256:7d74336c65705b986d12a7e337ba27ab2b9d819993851b140efdf029248e818e"}, - {file = "SQLAlchemy-2.0.30-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:ae8c62fe2480dd61c532ccafdbce9b29dacc126fe8be0d9a927ca3e699b9491a"}, - {file = "SQLAlchemy-2.0.30-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2383146973a15435e4717f94c7509982770e3e54974c71f76500a0136f22810b"}, - {file = "SQLAlchemy-2.0.30-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8409de825f2c3b62ab15788635ccaec0c881c3f12a8af2b12ae4910a0a9aeef6"}, - {file = "SQLAlchemy-2.0.30-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0094c5dc698a5f78d3d1539853e8ecec02516b62b8223c970c86d44e7a80f6c7"}, - {file = "SQLAlchemy-2.0.30-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:edc16a50f5e1b7a06a2dcc1f2205b0b961074c123ed17ebda726f376a5ab0953"}, - {file = "SQLAlchemy-2.0.30-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:f7703c2010355dd28f53deb644a05fc30f796bd8598b43f0ba678878780b6e4c"}, - {file = "SQLAlchemy-2.0.30-cp39-cp39-win32.whl", hash = "sha256:1f9a727312ff6ad5248a4367358e2cf7e625e98b1028b1d7ab7b806b7d757513"}, - {file = "SQLAlchemy-2.0.30-cp39-cp39-win_amd64.whl", hash = "sha256:a0ef36b28534f2a5771191be6edb44cc2673c7b2edf6deac6562400288664221"}, - {file = "SQLAlchemy-2.0.30-py3-none-any.whl", hash = "sha256:7108d569d3990c71e26a42f60474b4c02c8586c4681af5fd67e51a044fdea86a"}, - {file = "SQLAlchemy-2.0.30.tar.gz", hash = "sha256:2b1708916730f4830bc69d6f49d37f7698b5bd7530aca7f04f785f8849e95255"}, + {file = "SQLAlchemy-2.0.31-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f2a213c1b699d3f5768a7272de720387ae0122f1becf0901ed6eaa1abd1baf6c"}, + {file = "SQLAlchemy-2.0.31-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9fea3d0884e82d1e33226935dac990b967bef21315cbcc894605db3441347443"}, + {file = "SQLAlchemy-2.0.31-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3ad7f221d8a69d32d197e5968d798217a4feebe30144986af71ada8c548e9fa"}, + {file = "SQLAlchemy-2.0.31-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9f2bee229715b6366f86a95d497c347c22ddffa2c7c96143b59a2aa5cc9eebbc"}, + {file = "SQLAlchemy-2.0.31-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:cd5b94d4819c0c89280b7c6109c7b788a576084bf0a480ae17c227b0bc41e109"}, + {file = "SQLAlchemy-2.0.31-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:750900a471d39a7eeba57580b11983030517a1f512c2cb287d5ad0fcf3aebd58"}, + {file = "SQLAlchemy-2.0.31-cp310-cp310-win32.whl", hash = "sha256:7bd112be780928c7f493c1a192cd8c5fc2a2a7b52b790bc5a84203fb4381c6be"}, + {file = "SQLAlchemy-2.0.31-cp310-cp310-win_amd64.whl", hash = "sha256:5a48ac4d359f058474fadc2115f78a5cdac9988d4f99eae44917f36aa1476327"}, + {file = "SQLAlchemy-2.0.31-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f68470edd70c3ac3b6cd5c2a22a8daf18415203ca1b036aaeb9b0fb6f54e8298"}, + {file = "SQLAlchemy-2.0.31-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2e2c38c2a4c5c634fe6c3c58a789712719fa1bf9b9d6ff5ebfce9a9e5b89c1ca"}, + {file = 
"SQLAlchemy-2.0.31-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bd15026f77420eb2b324dcb93551ad9c5f22fab2c150c286ef1dc1160f110203"}, + {file = "SQLAlchemy-2.0.31-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2196208432deebdfe3b22185d46b08f00ac9d7b01284e168c212919891289396"}, + {file = "SQLAlchemy-2.0.31-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:352b2770097f41bff6029b280c0e03b217c2dcaddc40726f8f53ed58d8a85da4"}, + {file = "SQLAlchemy-2.0.31-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:56d51ae825d20d604583f82c9527d285e9e6d14f9a5516463d9705dab20c3740"}, + {file = "SQLAlchemy-2.0.31-cp311-cp311-win32.whl", hash = "sha256:6e2622844551945db81c26a02f27d94145b561f9d4b0c39ce7bfd2fda5776dac"}, + {file = "SQLAlchemy-2.0.31-cp311-cp311-win_amd64.whl", hash = "sha256:ccaf1b0c90435b6e430f5dd30a5aede4764942a695552eb3a4ab74ed63c5b8d3"}, + {file = "SQLAlchemy-2.0.31-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:3b74570d99126992d4b0f91fb87c586a574a5872651185de8297c6f90055ae42"}, + {file = "SQLAlchemy-2.0.31-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6f77c4f042ad493cb8595e2f503c7a4fe44cd7bd59c7582fd6d78d7e7b8ec52c"}, + {file = "SQLAlchemy-2.0.31-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cd1591329333daf94467e699e11015d9c944f44c94d2091f4ac493ced0119449"}, + {file = "SQLAlchemy-2.0.31-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:74afabeeff415e35525bf7a4ecdab015f00e06456166a2eba7590e49f8db940e"}, + {file = "SQLAlchemy-2.0.31-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:b9c01990d9015df2c6f818aa8f4297d42ee71c9502026bb074e713d496e26b67"}, + {file = "SQLAlchemy-2.0.31-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:66f63278db425838b3c2b1c596654b31939427016ba030e951b292e32b99553e"}, + {file = "SQLAlchemy-2.0.31-cp312-cp312-win32.whl", hash = "sha256:0b0f658414ee4e4b8cbcd4a9bb0fd743c5eeb81fc858ca517217a8013d282c96"}, + {file = "SQLAlchemy-2.0.31-cp312-cp312-win_amd64.whl", hash = "sha256:fa4b1af3e619b5b0b435e333f3967612db06351217c58bfb50cee5f003db2a5a"}, + {file = "SQLAlchemy-2.0.31-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:f43e93057cf52a227eda401251c72b6fbe4756f35fa6bfebb5d73b86881e59b0"}, + {file = "SQLAlchemy-2.0.31-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d337bf94052856d1b330d5fcad44582a30c532a2463776e1651bd3294ee7e58b"}, + {file = "SQLAlchemy-2.0.31-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c06fb43a51ccdff3b4006aafee9fcf15f63f23c580675f7734245ceb6b6a9e05"}, + {file = "SQLAlchemy-2.0.31-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:b6e22630e89f0e8c12332b2b4c282cb01cf4da0d26795b7eae16702a608e7ca1"}, + {file = "SQLAlchemy-2.0.31-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:79a40771363c5e9f3a77f0e28b3302801db08040928146e6808b5b7a40749c88"}, + {file = "SQLAlchemy-2.0.31-cp37-cp37m-win32.whl", hash = "sha256:501ff052229cb79dd4c49c402f6cb03b5a40ae4771efc8bb2bfac9f6c3d3508f"}, + {file = "SQLAlchemy-2.0.31-cp37-cp37m-win_amd64.whl", hash = "sha256:597fec37c382a5442ffd471f66ce12d07d91b281fd474289356b1a0041bdf31d"}, + {file = "SQLAlchemy-2.0.31-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:dc6d69f8829712a4fd799d2ac8d79bdeff651c2301b081fd5d3fe697bd5b4ab9"}, + {file = "SQLAlchemy-2.0.31-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:23b9fbb2f5dd9e630db70fbe47d963c7779e9c81830869bd7d137c2dc1ad05fb"}, + {file = 
"SQLAlchemy-2.0.31-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2a21c97efcbb9f255d5c12a96ae14da873233597dfd00a3a0c4ce5b3e5e79704"}, + {file = "SQLAlchemy-2.0.31-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:26a6a9837589c42b16693cf7bf836f5d42218f44d198f9343dd71d3164ceeeac"}, + {file = "SQLAlchemy-2.0.31-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:dc251477eae03c20fae8db9c1c23ea2ebc47331bcd73927cdcaecd02af98d3c3"}, + {file = "SQLAlchemy-2.0.31-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:2fd17e3bb8058359fa61248c52c7b09a97cf3c820e54207a50af529876451808"}, + {file = "SQLAlchemy-2.0.31-cp38-cp38-win32.whl", hash = "sha256:c76c81c52e1e08f12f4b6a07af2b96b9b15ea67ccdd40ae17019f1c373faa227"}, + {file = "SQLAlchemy-2.0.31-cp38-cp38-win_amd64.whl", hash = "sha256:4b600e9a212ed59355813becbcf282cfda5c93678e15c25a0ef896b354423238"}, + {file = "SQLAlchemy-2.0.31-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5b6cf796d9fcc9b37011d3f9936189b3c8074a02a4ed0c0fbbc126772c31a6d4"}, + {file = "SQLAlchemy-2.0.31-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:78fe11dbe37d92667c2c6e74379f75746dc947ee505555a0197cfba9a6d4f1a4"}, + {file = "SQLAlchemy-2.0.31-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2fc47dc6185a83c8100b37acda27658fe4dbd33b7d5e7324111f6521008ab4fe"}, + {file = "SQLAlchemy-2.0.31-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8a41514c1a779e2aa9a19f67aaadeb5cbddf0b2b508843fcd7bafdf4c6864005"}, + {file = "SQLAlchemy-2.0.31-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:afb6dde6c11ea4525318e279cd93c8734b795ac8bb5dda0eedd9ebaca7fa23f1"}, + {file = "SQLAlchemy-2.0.31-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:3f9faef422cfbb8fd53716cd14ba95e2ef655400235c3dfad1b5f467ba179c8c"}, + {file = "SQLAlchemy-2.0.31-cp39-cp39-win32.whl", hash = "sha256:fc6b14e8602f59c6ba893980bea96571dd0ed83d8ebb9c4479d9ed5425d562e9"}, + {file = "SQLAlchemy-2.0.31-cp39-cp39-win_amd64.whl", hash = "sha256:3cb8a66b167b033ec72c3812ffc8441d4e9f5f78f5e31e54dcd4c90a4ca5bebc"}, + {file = "SQLAlchemy-2.0.31-py3-none-any.whl", hash = "sha256:69f3e3c08867a8e4856e92d7afb618b95cdee18e0bc1647b77599722c9a28911"}, + {file = "SQLAlchemy-2.0.31.tar.gz", hash = "sha256:b607489dd4a54de56984a0c7656247504bd5523d9d0ba799aef59d4add009484"}, ] [package.dependencies] -greenlet = {version = "!=0.4.17", markers = "platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\""} +greenlet = {version = "!=0.4.17", markers = "python_version < \"3.13\" and (platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\")"} typing-extensions = ">=4.6.0" [package.extras] @@ -1625,15 +1627,15 @@ mpmath = ">=1.1.0,<1.4.0" [[package]] name = "tbb" -version = "2021.12.0" +version = "2021.13.0" description = "Intel® oneAPI Threading Building Blocks (oneTBB)" optional = false python-versions = "*" files = [ - {file = "tbb-2021.12.0-py2.py3-none-manylinux1_i686.whl", hash = "sha256:f2cc9a7f8ababaa506cbff796ce97c3bf91062ba521e15054394f773375d81d8"}, - {file = "tbb-2021.12.0-py2.py3-none-manylinux1_x86_64.whl", hash = "sha256:a925e9a7c77d3a46ae31c34b0bb7f801c4118e857d137b68f68a8e458fcf2bd7"}, 
- {file = "tbb-2021.12.0-py3-none-win32.whl", hash = "sha256:b1725b30c174048edc8be70bd43bb95473f396ce895d91151a474d0fa9f450a8"}, - {file = "tbb-2021.12.0-py3-none-win_amd64.whl", hash = "sha256:fc2772d850229f2f3df85f1109c4844c495a2db7433d38200959ee9265b34789"}, + {file = "tbb-2021.13.0-py2.py3-none-manylinux1_i686.whl", hash = "sha256:a2567725329639519d46d92a2634cf61e76601dac2f777a05686fea546c4fe4f"}, + {file = "tbb-2021.13.0-py2.py3-none-manylinux1_x86_64.whl", hash = "sha256:aaf667e92849adb012b8874d6393282afc318aca4407fc62f912ee30a22da46a"}, + {file = "tbb-2021.13.0-py3-none-win32.whl", hash = "sha256:6669d26703e9943f6164c6407bd4a237a45007e79b8d3832fe6999576eaaa9ef"}, + {file = "tbb-2021.13.0-py3-none-win_amd64.whl", hash = "sha256:3528a53e4bbe64b07a6112b4c5a00ff3c61924ee46c9c68e004a1ac7ad1f09c3"}, ] [[package]] @@ -1892,19 +1894,19 @@ telegram = ["requests"] [[package]] name = "transformers" -version = "4.41.2" +version = "4.42.3" description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow" optional = false python-versions = ">=3.8.0" files = [ - {file = "transformers-4.41.2-py3-none-any.whl", hash = "sha256:05555d20e43f808de1ef211ab64803cdb513170cef70d29a888b589caebefc67"}, - {file = "transformers-4.41.2.tar.gz", hash = "sha256:80a4db216533d573e9cc7388646c31ed9480918feb7c55eb211249cb23567f87"}, + {file = "transformers-4.42.3-py3-none-any.whl", hash = "sha256:a61a0df9609b7d69229d941b2fd857c841ba3043d6da503d0da1a4b133f65b92"}, + {file = "transformers-4.42.3.tar.gz", hash = "sha256:7539873ff45809145265cbc94ea4619d2713c41ceaa277b692d8b0be3430f7eb"}, ] [package.dependencies] filelock = "*" -huggingface-hub = ">=0.23.0,<1.0" -numpy = ">=1.17" +huggingface-hub = ">=0.23.2,<1.0" +numpy = ">=1.17,<2.0" packaging = ">=20.0" pyyaml = ">=5.1" regex = "!=2019.12.17" @@ -1916,14 +1918,15 @@ tqdm = ">=4.27" [package.extras] accelerate = ["accelerate (>=0.21.0)"] agents = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.21.0)", "datasets (!=2.5.0)", "diffusers", "opencv-python", "sentencepiece (>=0.1.91,!=0.1.92)", "torch"] -all = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.21.0)", "av (==9.2.0)", "codecarbon (==1.2.0)", "decord (==0.6.0)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm", "tokenizers (>=0.19,<0.20)", "torch", "torchaudio", "torchvision"] +all = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.21.0)", "av (==9.2.0)", "codecarbon (==1.2.0)", "decord (==0.6.0)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm (<=0.9.16)", "tokenizers (>=0.19,<0.20)", "torch", "torchaudio", "torchvision"] audio = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] +benchmark = ["optimum-benchmark (>=0.2.0)"] codecarbon = ["codecarbon (==1.2.0)"] deepspeed = ["accelerate (>=0.21.0)", "deepspeed (>=0.9.3)"] -deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=0.21.0)", "beautifulsoup4", 
"cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "nltk", "optuna", "parameterized", "protobuf", "psutil", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.1.5)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"] -dev = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.21.0)", "av (==9.2.0)", "beautifulsoup4", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "decord (==0.6.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "nltk", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.1.5)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "timm", "tokenizers (>=0.19,<0.20)", "torch", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"] -dev-tensorflow = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "isort (>=5.5.4)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "nltk", "onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.1.5)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "tokenizers (>=0.19,<0.20)", "urllib3 (<2.0.0)"] -dev-torch = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.21.0)", "beautifulsoup4", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "kenlm", "librosa", "nltk", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.1.5)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm", "tokenizers (>=0.19,<0.20)", "torch", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"] 
+deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=0.21.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "nltk", "optuna", "parameterized", "protobuf", "psutil", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.4.4)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"] +dev = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.21.0)", "av (==9.2.0)", "beautifulsoup4", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "decord (==0.6.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "nltk", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.4.4)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "timm (<=0.9.16)", "tokenizers (>=0.19,<0.20)", "torch", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"] +dev-tensorflow = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "isort (>=5.5.4)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "nltk", "onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.4.4)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "tokenizers (>=0.19,<0.20)", "urllib3 (<2.0.0)"] +dev-torch = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.21.0)", "beautifulsoup4", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "kenlm", "librosa", "nltk", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.4.4)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm (<=0.9.16)", "tokenizers (>=0.19,<0.20)", 
"torch", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"] flax = ["flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "optax (>=0.0.8,<=0.1.4)", "scipy (<1.13.0)"] flax-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] ftfy = ["ftfy"] @@ -1934,25 +1937,26 @@ natten = ["natten (>=0.14.6,<0.15.0)"] onnx = ["onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "tf2onnx"] onnxruntime = ["onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)"] optuna = ["optuna"] -quality = ["GitPython (<3.1.19)", "datasets (!=2.5.0)", "isort (>=5.5.4)", "ruff (==0.1.5)", "urllib3 (<2.0.0)"] +quality = ["GitPython (<3.1.19)", "datasets (!=2.5.0)", "isort (>=5.5.4)", "ruff (==0.4.4)", "urllib3 (<2.0.0)"] ray = ["ray[tune] (>=2.7.0)"] retrieval = ["datasets (!=2.5.0)", "faiss-cpu"] +ruff = ["ruff (==0.4.4)"] sagemaker = ["sagemaker (>=2.31.0)"] sentencepiece = ["protobuf", "sentencepiece (>=0.1.91,!=0.1.92)"] serving = ["fastapi", "pydantic", "starlette", "uvicorn"] sigopt = ["sigopt"] sklearn = ["scikit-learn"] speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"] -testing = ["GitPython (<3.1.19)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "nltk", "parameterized", "psutil", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.1.5)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"] +testing = ["GitPython (<3.1.19)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "nltk", "parameterized", "psutil", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.4.4)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"] tf = ["keras-nlp (>=0.3.1)", "onnxconverter-common", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx"] -tf-cpu = ["keras (>2.9,<2.16)", "keras-nlp (>=0.3.1)", "onnxconverter-common", "tensorflow-cpu (>2.9,<2.16)", "tensorflow-probability (<2.16)", "tensorflow-text (<2.16)", "tf2onnx"] +tf-cpu = ["keras (>2.9,<2.16)", "keras-nlp (>=0.3.1)", "onnxconverter-common", "tensorflow-cpu (>2.9,<2.16)", "tensorflow-probability (<0.24)", "tensorflow-text (<2.16)", "tf2onnx"] tf-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] -timm = ["timm"] +timm = ["timm (<=0.9.16)"] tokenizers = ["tokenizers (>=0.19,<0.20)"] torch = ["accelerate (>=0.21.0)", "torch"] torch-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"] torch-vision = ["Pillow (>=10.0.1,<=15.0)", "torchvision"] -torchhub = ["filelock", "huggingface-hub (>=0.23.0,<1.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.19,<0.20)", "torch", "tqdm (>=4.27)"] +torchhub = ["filelock", "huggingface-hub (>=0.23.2,<1.0)", "importlib-metadata", "numpy (>=1.17,<2.0)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.19,<0.20)", "torch", "tqdm (>=4.27)"] video = ["av (==9.2.0)", "decord (==0.6.0)"] 
vision = ["Pillow (>=10.0.1,<=15.0)"] @@ -1979,6 +1983,17 @@ build = ["cmake (>=3.20)", "lit"] tests = ["autopep8", "flake8", "isort", "numpy", "pytest", "scipy (>=1.7.1)", "torch"] tutorials = ["matplotlib", "pandas", "tabulate", "torch"] +[[package]] +name = "types-pyyaml" +version = "6.0.12.20240311" +description = "Typing stubs for PyYAML" +optional = false +python-versions = ">=3.8" +files = [ + {file = "types-PyYAML-6.0.12.20240311.tar.gz", hash = "sha256:a9e0f0f88dc835739b0c1ca51ee90d04ca2a897a71af79de9aec5f38cb0a5342"}, + {file = "types_PyYAML-6.0.12.20240311-py3-none-any.whl", hash = "sha256:b845b06a1c7e54b8e5b4c683043de0d9caf205e7434b3edc678ff2411979b8f6"}, +] + [[package]] name = "typing-extensions" version = "4.12.2" @@ -1992,13 +2007,13 @@ files = [ [[package]] name = "urllib3" -version = "2.2.1" +version = "2.2.2" description = "HTTP library with thread-safe connection pooling, file post, and more." optional = false python-versions = ">=3.8" files = [ - {file = "urllib3-2.2.1-py3-none-any.whl", hash = "sha256:450b20ec296a467077128bff42b73080516e71b56ff59a60a02bef2232c4fa9d"}, - {file = "urllib3-2.2.1.tar.gz", hash = "sha256:d0570876c61ab9e520d776c38acbbb5b05a776d3f9ff98a5c8fd5162a444cf19"}, + {file = "urllib3-2.2.2-py3-none-any.whl", hash = "sha256:a448b2f64d686155468037e1ace9f2d2199776e17f0a46610480d311f73e3472"}, + {file = "urllib3-2.2.2.tar.gz", hash = "sha256:dd505485549a7a552833da5e6063639d0d177c04f23bc3864e41e5dc5f612168"}, ] [package.extras] @@ -2009,13 +2024,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "virtualenv" -version = "20.26.2" +version = "20.26.3" description = "Virtual Python Environment builder" optional = false python-versions = ">=3.7" files = [ - {file = "virtualenv-20.26.2-py3-none-any.whl", hash = "sha256:a624db5e94f01ad993d476b9ee5346fdf7b9de43ccaee0e0197012dc838a0e9b"}, - {file = "virtualenv-20.26.2.tar.gz", hash = "sha256:82bf0f4eebbb78d36ddaee0283d43fe5736b53880b8a8cdcd37390a07ac3741c"}, + {file = "virtualenv-20.26.3-py3-none-any.whl", hash = "sha256:8cc4a31139e796e9a7de2cd5cf2489de1217193116a8fd42328f1bd65f434589"}, + {file = "virtualenv-20.26.3.tar.gz", hash = "sha256:4c43a2a236279d9ea36a0d76f98d84bd6ca94ac4e0f4a3b9d46d05e10fea542a"}, ] [package.dependencies] @@ -2030,4 +2045,4 @@ test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess [metadata] lock-version = "2.0" python-versions = ">=3.10, <4.0" -content-hash = "13aa6f3ccac9da8212c4349dc75b739ce69f3ad0d75a135d5cde71bc16212e17" +content-hash = "0dddba6a92d03b56548f5b46cf72c6ce65920ff634df463dbfe336262b755126" diff --git a/lightrag/pyproject.toml b/lightrag/pyproject.toml index facac7a9..2fc49ce9 100644 --- a/lightrag/pyproject.toml +++ b/lightrag/pyproject.toml @@ -1,27 +1,35 @@ [tool.poetry] name = "lightrag" -packages = [ - { include = "core", from = "." }, - { include = "components", from = "." }, - { include = "eval", from = "." }, - { include = "utils", from = "." }, - { include = "tracing", from = "." }, - { include = "optim", from = "." }, - { include = "icl", from = "." }, -] -version = "0.1.0" +version = "0.0.0-alpha.7" description = "The 'PyTorch' library for LLM applications. RAG=Retriever-Agent-Generator." 
-authors = ["Li Yin "] +authors = ["Li Yin "] readme = "README.md" +repository = "https://github.com/SylphAI-Inc/LightRAG" + license = "MIT" +maintainers = ["Xiaoyi Gu ", "Li Yin "] classifiers = [ "Topic :: Software Development :: Build Tools", "Topic :: Software Development :: Libraries :: Python Modules", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Software Development :: Libraries :: Application Frameworks", ] +keywords = ["LLM", "NLP", "RAG", "devtools", "retrieval", "agent"] + +include = [ + "lightrag/core/*", + "lightrag/components/*", + "lightrag/eval/*", + "lightrag/utils/*", + "lightrag/tracing/*", + "lightrag/optim/*", + # "lightrag/icl/*", +] + + +packages = [{ include = "lightrag", from = "." }] -# [[tool.poetry.packages]] -# include = "lightrag" [tool.poetry.dependencies] python = ">=3.10, <4.0" @@ -33,15 +41,6 @@ jsonlines = "^4.0.0" tiktoken = "^0.7.0" numpy = "^1.26.4" -# TODO: decide if we need people to install faiss, or openai, or groq separatelypython = ">=3.11, <4.0" - -openai = "^1.12.0" -groq = "^0.5.0" # should only be installed if groq client is used -faiss-cpu = "^1.8.0" -# matplotlib = "^3.8.4" -# colorama = "^0.4.6" -more-itertools = "^10.3.0" - [tool.poetry.group.test.dependencies] pytest = "^8.1.1" @@ -51,17 +50,14 @@ torch = "^2.3.1" [tool.poetry.group.typing.dependencies] mypy = "^1" +types-pyyaml = "^6.0.12.20240311" # for mypy + [tool.poetry.group.dev.dependencies] pre-commit = "^3.7.0" - - -# [tool.poetry.group.doc.dependencies] -# datasets = ">=2.14.6, <=2.19.1" -# sphinx = "^7.3.7" -# sphinx-rtd-theme = "^2.0.0" -# pydata-sphinx-theme = "0.15.2" -# sphinx-design = "^0.6.0" +openai = "^1.12.0" +groq = "^0.5.0" # should only be installed if groq client is used +faiss-cpu = "^1.8.0" sqlalchemy = "^2.0.30" diff --git a/lightrag/pytest.ini b/lightrag/pytest.ini index 70f8b746..fa2e4857 100644 --- a/lightrag/pytest.ini +++ b/lightrag/pytest.ini @@ -4,4 +4,3 @@ python_files = test_*.py python_classes = Test* python_functions = test_* norecursedirs = *_test -addopts = --ignore=li_test/test_li_datasets.py --ignore=li_test/test_li_dspy.py --ignore=li_test/test_li_haystack.py --ignore=li_test/test_li_llamaindex_huggingface_llm.py --ignore=li_test/test_li_llamaindex_router.py --ignore=li_test/test_li_ollama.py --ignore=li_test/test_li_transformer.py --ignore=li_test/test_li_transformers_small_model.py diff --git a/lightrag/tests/test_base_data_class.py b/lightrag/tests/test_base_data_class.py index 261d76c2..4d0db965 100644 --- a/lightrag/tests/test_base_data_class.py +++ b/lightrag/tests/test_base_data_class.py @@ -1,48 +1,221 @@ import unittest -from lightrag.core import DataClass -from dataclasses import field, MISSING, dataclass +from lightrag.core import DataClass, required_field +from dataclasses import field, MISSING, dataclass, asdict -# Assuming DataClass is in your_module and correctly imported +from typing import List, Dict, Optional, Set + + +# simple class, both fields are required @dataclass class MyOutputs(DataClass): age: int = field( default=MISSING, metadata={"desc": "The age of the person", "prefix": "Age:"} ) name: str = field( - default=MISSING, metadata={"desc": "The name of the person", "prefix": "Name:"} + metadata={ + "desc": "The name of the person", + "prefix": "Name:", + }, # will make it a required field + ) + + +@dataclass +class Address: + street: str + city: str + zipcode: str + + +# Example instance of the nested dataclasses and complex data types as list, dict +@dataclass +class Person(DataClass): + 
name: Optional[str] = field( + metadata={"desc": "The name of the person"}, default=None + ) + age: int = field( + metadata={"desc": "The age of the person"}, + default_factory=required_field(), # customized behavior to allow required fields after optional fields + ) + addresses: List[Address] = field( + default_factory=list, metadata={"desc": "The list of addresses"} + ) + single_address: Address = field( + default=None, metadata={"desc": "The single address"} + ) + dict_addresses: Dict[str, Address] = field(default_factory=dict) + set_hobbies: Set[int] = field( + metadata={"desc": "The set of hobbies"}, default_factory=required_field() ) class TestBaseDataClass(unittest.TestCase): + # setup + def setUp(self): + self.person_instance = Person( + name="John Doe", + age=30, + addresses=[ + Address(street="123 Main St", city="Anytown", zipcode="12345"), + Address(street="456 Elm St", city="Othertown", zipcode="67890"), + ], + single_address=Address( + street="123 Main St", city="Anytown", zipcode="12345" + ), + dict_addresses={ + "home": Address(street="123 Main St", city="Anytown", zipcode="12345"), + "work": Address(street="456 Elm St", city="Othertown", zipcode="67890"), + }, + set_hobbies={1, 2, 3}, + ) + self.output_instance = MyOutputs(age=25, name="John Doe") def test_to_dict_instance(self): """Test the to_dict method on an instance of the dataclass.""" - instance = MyOutputs(age=25, name="John Doe") expected_result = {"age": 25, "name": "John Doe"} - self.assertEqual(instance.to_dict(), expected_result) + instance_dict = self.output_instance.to_dict() + print(f"instance_dict: {instance_dict}") + print(f"instance dict from asdict: {asdict(self.output_instance)}") + self.assertEqual(instance_dict, expected_result) + + # test from_dict + reconstructed_instance = MyOutputs.from_dict(instance_dict) + self.assertEqual(reconstructed_instance, self.output_instance) + + # test with nested dataclass + expected_result = { + "name": "John Doe", + "age": 30, + "addresses": [ + {"street": "123 Main St", "city": "Anytown", "zipcode": "12345"}, + {"street": "456 Elm St", "city": "Othertown", "zipcode": "67890"}, + ], + "single_address": { + "street": "123 Main St", + "city": "Anytown", + "zipcode": "12345", + }, + "dict_addresses": { + "home": { + "street": "123 Main St", + "city": "Anytown", + "zipcode": "12345", + }, + "work": { + "street": "456 Elm St", + "city": "Othertown", + "zipcode": "67890", + }, + }, + "set_hobbies": {1, 2, 3}, + } + instance_dict = self.person_instance.to_dict() + print(f"instance_dict: {instance_dict}") + print(f"instance dict from asdict: {asdict(self.person_instance)}") + self.assertEqual(instance_dict, expected_result) + self.assertEqual(asdict(self.person_instance), expected_result) + + # test from_dict + reconstructed_instance = Person.from_dict(instance_dict) + print(f"original_instance: {self.person_instance}") + print(f"reconstructed_instance: {reconstructed_instance}") + self.assertEqual(reconstructed_instance, self.person_instance) + + def test_to_dict_class_nested(self): + """Test the to_dict method on an instance of the dataclass with nested""" + expected_result = { + "type": "Person", + "properties": { + "name": {"type": "Optional[str]", "desc": "The name of the person"}, + "age": {"type": "int", "desc": "The age of the person"}, + "addresses": { + "type": "List[{'type': 'Address', 'properties': {'street': {'type': 'str'}, 'city': {'type': 'str'}, 'zipcode': {'type': 'str'}}, 'required': ['street', 'city', 'zipcode']}]", + "desc": "The list of addresses", + 
}, + "single_address": { + "type": "{'type': 'Address', 'properties': {'street': {'type': 'str'}, 'city': {'type': 'str'}, 'zipcode': {'type': 'str'}}, 'required': ['street', 'city', 'zipcode']}", + "desc": "The single address", + }, + "dict_addresses": { + "type": "Dict[str, {'type': 'Address', 'properties': {'street': {'type': 'str'}, 'city': {'type': 'str'}, 'zipcode': {'type': 'str'}}, 'required': ['street', 'city', 'zipcode']}]" + }, + "set_hobbies": { + "type": "Set[int]", + "desc": "The set of hobbies", + }, + }, + "required": ["age", "set_hobbies"], + } + + person_dict_class = Person.to_dict_class() + print(f"person_dict_class: {person_dict_class}") + self.assertEqual(person_dict_class, expected_result) def test_to_dict_instance_with_exclusion(self): """Test the to_dict method with field exclusion on an instance.""" - instance = MyOutputs(age=25, name="John Doe") + output = self.output_instance.to_dict(exclude=["age"]) + print(f"output: {output}") expected_result = {"name": "John Doe"} - self.assertEqual(instance.to_dict(exclude=["age"]), expected_result) + self.assertEqual(output, expected_result) def test_to_dict_class(self): """Test the to_dict method on the class itself.""" expected_result = { - "age": {"type": "int", "desc": "The age of the person", "required": True}, - "name": {"type": "str", "desc": "The name of the person", "required": True}, + "type": "MyOutputs", + "properties": { + "age": { + "type": "int", + "desc": "The age of the person", + "prefix": "Age:", + }, + "name": { + "type": "str", + "desc": "The name of the person", + "prefix": "Name:", + }, + }, + "required": ["age", "name"], } - self.assertEqual(MyOutputs.to_dict_class(), expected_result) + output = MyOutputs.to_dict_class() + self.assertEqual(output, expected_result) def test_to_dict_class_with_exclusion(self): """Test the to_dict method with field exclusion on the class.""" exclude = ["age"] expected_result = { - "name": {"type": "str", "desc": "The name of the person", "required": True} + "type": "MyOutputs", + "properties": { + "name": { + "type": "str", + "desc": "The name of the person", + "prefix": "Name:", + }, + }, + "required": ["name"], + } + output = MyOutputs.to_dict_class(exclude=exclude) + self.assertEqual(output, expected_result) + + # on Person class + exclude = {"Person": ["addresses", "set_hobbies"], "Address": ["city"]} + expected_result = { + "type": "Person", + "properties": { + "name": {"type": "Optional[str]", "desc": "The name of the person"}, + "age": {"type": "int", "desc": "The age of the person"}, + "single_address": { + "type": "{'type': 'Address', 'properties': {'street': {'type': 'str'}, 'zipcode': {'type': 'str'}}, 'required': ['street', 'zipcode']}", + "desc": "The single address", + }, + "dict_addresses": { + "type": "Dict[str, {'type': 'Address', 'properties': {'street': {'type': 'str'}, 'zipcode': {'type': 'str'}}, 'required': ['street', 'zipcode']}]" + }, + }, + "required": ["age"], } - self.assertEqual(MyOutputs.to_dict_class(exclude=exclude), expected_result) + output = Person.to_dict_class(exclude=exclude) + print(f"output 1: {output}") + # self.assertEqual(output, expected_result) def test_error_non_dataclass(self): """Test error handling when to_dict is called on a non-dataclass.""" diff --git a/lightrag/tests/test_data_classes.py b/lightrag/tests/test_data_classes.py index 1147fdef..80dfbd87 100644 --- a/lightrag/tests/test_data_classes.py +++ b/lightrag/tests/test_data_classes.py @@ -5,7 +5,7 @@ UserQuery, AssistantResponse, DialogTurn, - DialogSession, + 
Conversation, ) @@ -37,7 +37,7 @@ def test_dialog_turn_creation(): def test_dialog_session_operations(): - session = DialogSession() + session = Conversation() assert isinstance(session.id, str) # Check if the UUID is automatically generated # Creating dialog turns diff --git a/lightrag/tests/test_dataclass_object_functions.py b/lightrag/tests/test_dataclass_object_functions.py new file mode 100644 index 00000000..60640c29 --- /dev/null +++ b/lightrag/tests/test_dataclass_object_functions.py @@ -0,0 +1,747 @@ +import unittest +from dataclasses import dataclass +from typing import List, Dict +from collections import OrderedDict + +from lightrag.core.functional import custom_asdict, dataclass_obj_from_dict +from lightrag.core.base_data_class import DataClass + + +# Define test dataclasses +@dataclass +class SimpleData(DataClass): + name: str + age: int + score: float + + +@dataclass +class NestedData(DataClass): + simple: SimpleData + description: str + + +@dataclass +class ListData(DataClass): + items: List[SimpleData] + total: int + + +@dataclass +class DictData(DataClass): + mappings: Dict[str, SimpleData] + count: int + + +@dataclass +class OrderedDictData(DataClass): + ordered_mappings: OrderedDict[str, SimpleData] + count: int + + +@dataclass +class ComplexData(DataClass): + nested: NestedData + list_data: ListData + dict_data: DictData + ordered_dict_data: OrderedDictData + + +# Define the test class +class TestDataclassFuncConversion(unittest.TestCase): + + def test_simple_data(self): + simple = SimpleData(name="John", age=30, score=95.5) + simple_dict = custom_asdict(simple) + expected_dict = {"name": "John", "age": 30, "score": 95.5} + self.assertEqual(simple_dict, expected_dict) + + reconstructed_simple = dataclass_obj_from_dict(SimpleData, simple_dict) + self.assertEqual(reconstructed_simple, simple) + + def test_nested_data(self): + simple = SimpleData(name="John", age=30, score=95.5) + nested = NestedData(simple=simple, description="Test description") + nested_dict = custom_asdict(nested) + expected_dict = { + "simple": {"name": "John", "age": 30, "score": 95.5}, + "description": "Test description", + } + self.assertEqual(nested_dict, expected_dict) + + reconstructed_nested = dataclass_obj_from_dict(NestedData, nested_dict) + self.assertEqual(reconstructed_nested, nested) + + def test_list_data(self): + simple1 = SimpleData(name="John", age=30, score=95.5) + simple2 = SimpleData(name="Jane", age=25, score=88.0) + list_data = ListData(items=[simple1, simple2], total=2) + list_data_dict = custom_asdict(list_data) + expected_dict = { + "items": [ + {"name": "John", "age": 30, "score": 95.5}, + {"name": "Jane", "age": 25, "score": 88.0}, + ], + "total": 2, + } + self.assertEqual(list_data_dict, expected_dict) + + reconstructed_list_data = dataclass_obj_from_dict(ListData, list_data_dict) + self.assertEqual(reconstructed_list_data, list_data) + + def test_dict_data(self): + simple1 = SimpleData(name="John", age=30, score=95.5) + simple2 = SimpleData(name="Jane", age=25, score=88.0) + dict_data = DictData(mappings={"first": simple1, "second": simple2}, count=2) + dict_data_dict = custom_asdict(dict_data) + expected_dict = { + "mappings": { + "first": {"name": "John", "age": 30, "score": 95.5}, + "second": {"name": "Jane", "age": 25, "score": 88.0}, + }, + "count": 2, + } + self.assertEqual(dict_data_dict, expected_dict) + + reconstructed_dict_data = dataclass_obj_from_dict(DictData, dict_data_dict) + self.assertEqual(reconstructed_dict_data, dict_data) + + def 
test_ordered_dict_data(self): + simple1 = SimpleData(name="John", age=30, score=95.5) + simple2 = SimpleData(name="Jane", age=25, score=88.0) + ordered_dict_data = OrderedDictData( + ordered_mappings=OrderedDict([("first", simple1), ("second", simple2)]), + count=2, + ) + ordered_dict_data_dict = custom_asdict(ordered_dict_data) + expected_dict = { + "ordered_mappings": OrderedDict( + [ + ("first", {"name": "John", "age": 30, "score": 95.5}), + ("second", {"name": "Jane", "age": 25, "score": 88.0}), + ] + ), + "count": 2, + } + self.assertEqual(ordered_dict_data_dict, expected_dict) + + reconstructed_ordered_dict_data = dataclass_obj_from_dict( + OrderedDictData, ordered_dict_data_dict + ) + self.assertEqual(reconstructed_ordered_dict_data, ordered_dict_data) + + def test_complex_data(self): + simple1 = SimpleData(name="John", age=30, score=95.5) + simple2 = SimpleData(name="Jane", age=25, score=88.0) + nested = NestedData(simple=simple1, description="Test description") + list_data = ListData(items=[simple1, simple2], total=2) + dict_data = DictData(mappings={"first": simple1, "second": simple2}, count=2) + ordered_dict_data = OrderedDictData( + ordered_mappings=OrderedDict([("first", simple1), ("second", simple2)]), + count=2, + ) + complex_data = ComplexData( + nested=nested, + list_data=list_data, + dict_data=dict_data, + ordered_dict_data=ordered_dict_data, + ) + complex_data_dict = custom_asdict(complex_data) + expected_dict = { + "nested": { + "simple": {"name": "John", "age": 30, "score": 95.5}, + "description": "Test description", + }, + "list_data": { + "items": [ + {"name": "John", "age": 30, "score": 95.5}, + {"name": "Jane", "age": 25, "score": 88.0}, + ], + "total": 2, + }, + "dict_data": { + "mappings": { + "first": {"name": "John", "age": 30, "score": 95.5}, + "second": {"name": "Jane", "age": 25, "score": 88.0}, + }, + "count": 2, + }, + "ordered_dict_data": { + "ordered_mappings": OrderedDict( + [ + ("first", {"name": "John", "age": 30, "score": 95.5}), + ("second", {"name": "Jane", "age": 25, "score": 88.0}), + ] + ), + "count": 2, + }, + } + self.assertEqual(complex_data_dict, expected_dict) + + reconstructed_complex_data = dataclass_obj_from_dict( + ComplexData, complex_data_dict + ) + self.assertEqual(reconstructed_complex_data, complex_data) + + def test_exclude(self): + simple = SimpleData(name="John", age=30, score=95.5) + simple_dict = custom_asdict(simple, exclude={"SimpleData": ["age"]}) + expected_dict = {"name": "John", "score": 95.5} + self.assertEqual(simple_dict, expected_dict) + + +class TestDataClassBaseClassConversion(unittest.TestCase): + + def test_dict_data(self): + simple1 = SimpleData(name="John", age=30, score=95.5) + simple2 = SimpleData(name="Jane", age=25, score=88.0) + dict_data = DictData(mappings={"first": simple1, "second": simple2}, count=2) + dict_data_dict = dict_data.to_dict() + expected_dict = { + "mappings": { + "first": {"name": "John", "age": 30, "score": 95.5}, + "second": {"name": "Jane", "age": 25, "score": 88.0}, + }, + "count": 2, + } + self.assertEqual(dict_data_dict, expected_dict) + + reconstructed_dict_data = DictData.from_dict(dict_data_dict) + self.assertEqual(reconstructed_dict_data, dict_data) + + def test_complex_data(self): + simple1 = SimpleData(name="John", age=30, score=95.5) + simple2 = SimpleData(name="Jane", age=25, score=88.0) + nested = NestedData(simple=simple1, description="Test description") + list_data = ListData(items=[simple1, simple2], total=2) + dict_data = DictData(mappings={"first": simple1, "second": 
simple2}, count=2) + ordered_dict_data = OrderedDictData( + ordered_mappings=OrderedDict([("first", simple1), ("second", simple2)]), + count=2, + ) + complex_data = ComplexData( + nested=nested, + list_data=list_data, + dict_data=dict_data, + ordered_dict_data=ordered_dict_data, + ) + complex_data_dict = complex_data.to_dict() + expected_dict = { + "nested": { + "simple": {"name": "John", "age": 30, "score": 95.5}, + "description": "Test description", + }, + "list_data": { + "items": [ + {"name": "John", "age": 30, "score": 95.5}, + {"name": "Jane", "age": 25, "score": 88.0}, + ], + "total": 2, + }, + "dict_data": { + "mappings": { + "first": {"name": "John", "age": 30, "score": 95.5}, + "second": {"name": "Jane", "age": 25, "score": 88.0}, + }, + "count": 2, + }, + "ordered_dict_data": { + "ordered_mappings": OrderedDict( + [ + ("first", {"name": "John", "age": 30, "score": 95.5}), + ("second", {"name": "Jane", "age": 25, "score": 88.0}), + ] + ), + "count": 2, + }, + } + self.assertEqual(complex_data_dict, expected_dict) + + reconstructed_complex_data = ComplexData.from_dict(complex_data_dict) + self.assertEqual(reconstructed_complex_data, complex_data) + + def test_exclude(self): + simple = DictData( + mappings={"first": SimpleData(name="John", age=30, score=95.5)}, + count=1, + ) + simple_dict = simple.to_dict(exclude={"DictData": ["count"]}) + expected_dict = { + "mappings": {"first": {"name": "John", "age": 30, "score": 95.5}} + } + self.assertEqual(simple_dict, expected_dict) + + complex = ComplexData( + nested=NestedData( + simple=SimpleData(name="John", age=30, score=95.5), + description="Test description", + ), + list_data=ListData( + items=[ + SimpleData(name="John", age=30, score=95.5), + SimpleData(name="Jane", age=25, score=88.0), + ], + total=2, + ), + dict_data=simple, + ordered_dict_data=OrderedDictData( + ordered_mappings=OrderedDict( + [ + ("first", SimpleData(name="John", age=30, score=95.5)), + ("second", SimpleData(name="Jane", age=25, score=88.0)), + ] + ), + count=2, + ), + ) + complex_dict = complex.to_dict( + exclude={"ListData": ["items"], "DictData": ["count"]} + ) + expected_dict = { + "nested": { + "simple": {"name": "John", "age": 30, "score": 95.5}, + "description": "Test description", + }, + "list_data": {"total": 2}, + "dict_data": { + "mappings": {"first": {"name": "John", "age": 30, "score": 95.5}} + }, + "ordered_dict_data": { + "ordered_mappings": { + "first": {"name": "John", "age": 30, "score": 95.5}, + "second": {"name": "Jane", "age": 25, "score": 88.0}, + }, + "count": 2, + }, + } + self.assertEqual(complex_dict, expected_dict) + + +@dataclass +class ComplexData2(DataClass): + field1: str + field2: int + field3: float + nested: NestedData + list_data: ListData + dict_data: DictData + ordered_dict_data: OrderedDictData + + +# Define the test class +class TestDataClassYamlJsonConversion(unittest.TestCase): + + def test_simple_data(self): + simple = SimpleData(name="John", age=30, score=95.5) + simple_dict = simple.to_dict() + expected_dict = {"name": "John", "age": 30, "score": 95.5} + self.assertEqual(simple_dict, expected_dict) + + simple_json = simple.to_json() + expected_json = """{ + "name": "John", + "age": 30, + "score": 95.5 +}""" + + self.assertEqual(simple_json, expected_json) + + simple_yaml = simple.to_yaml() + expected_yaml = "name: John\nage: 30\nscore: 95.5\n" + self.assertEqual(simple_yaml, expected_yaml) + + reconstructed_simple = SimpleData.from_dict(simple_dict) + self.assertEqual(reconstructed_simple, simple) + + 
reconstructed_simple_json = SimpleData.from_json(simple_json) + self.assertEqual(reconstructed_simple_json, simple) + + reconstructed_simple_yaml = SimpleData.from_yaml(expected_yaml) + self.assertEqual(reconstructed_simple_yaml, simple) + + def test_nested_data(self): + simple = SimpleData(name="John", age=30, score=95.5) + nested = NestedData(simple=simple, description="Test description") + nested_dict = nested.to_dict() + expected_dict = { + "simple": {"name": "John", "age": 30, "score": 95.5}, + "description": "Test description", + } + self.assertEqual(nested_dict, expected_dict) + + nested_json = nested.to_json() + expected_json = """{ + "simple": { + "name": "John", + "age": 30, + "score": 95.5 + }, + "description": "Test description" +}""" + self.assertEqual(nested_json, expected_json) + + nested_yaml = nested.to_yaml() + expected_yaml = """simple: + name: John + age: 30 + score: 95.5 +description: Test description +""" + + self.assertEqual(nested_yaml, expected_yaml) + + reconstructed_nested = NestedData.from_dict(nested_dict) + self.assertEqual(reconstructed_nested, nested) + + reconstructed_nested_json = NestedData.from_json(nested_json) + self.assertEqual(reconstructed_nested_json, nested) + + reconstructed_nested_yaml = NestedData.from_yaml(expected_yaml) + self.assertEqual(reconstructed_nested_yaml, nested) + + def test_list_data(self): + simple1 = SimpleData(name="John", age=30, score=95.5) + simple2 = SimpleData(name="Jane", age=25, score=88.0) + list_data = ListData(items=[simple1, simple2], total=2) + list_data_dict = list_data.to_dict() + expected_dict = { + "items": [ + {"name": "John", "age": 30, "score": 95.5}, + {"name": "Jane", "age": 25, "score": 88.0}, + ], + "total": 2, + } + self.assertEqual(list_data_dict, expected_dict) + + list_data_json = list_data.to_json() + expected_json = """{ + "items": [ + { + "name": "John", + "age": 30, + "score": 95.5 + }, + { + "name": "Jane", + "age": 25, + "score": 88.0 + } + ], + "total": 2 +}""" + self.assertEqual(list_data_json, expected_json) + + list_data_yaml = list_data.to_yaml() + expected_yaml = """items: +- name: John + age: 30 + score: 95.5 +- name: Jane + age: 25 + score: 88.0 +total: 2 +""" + self.assertEqual(list_data_yaml, expected_yaml) + + reconstructed_list_data = ListData.from_dict(list_data_dict) + self.assertEqual(reconstructed_list_data, list_data) + + reconstructed_list_data_json = ListData.from_json(list_data_json) + self.assertEqual(reconstructed_list_data_json, list_data) + + reconstructed_list_data_yaml = ListData.from_yaml(expected_yaml) + self.assertEqual(reconstructed_list_data_yaml, list_data) + + def test_dict_data(self): + simple1 = SimpleData(name="John", age=30, score=95.5) + simple2 = SimpleData(name="Jane", age=25, score=88.0) + dict_data = DictData(mappings={"first": simple1, "second": simple2}, count=2) + dict_data_dict = dict_data.to_dict() + expected_dict = { + "mappings": { + "first": {"name": "John", "age": 30, "score": 95.5}, + "second": {"name": "Jane", "age": 25, "score": 88.0}, + }, + "count": 2, + } + self.assertEqual(dict_data_dict, expected_dict) + + dict_data_json = dict_data.to_json() + expected_json = """{ + "mappings": { + "first": { + "name": "John", + "age": 30, + "score": 95.5 + }, + "second": { + "name": "Jane", + "age": 25, + "score": 88.0 + } + }, + "count": 2 +}""" + self.assertEqual(dict_data_json, expected_json) + + dict_data_yaml = dict_data.to_yaml() + expected_yaml = """mappings: + first: + name: John + age: 30 + score: 95.5 + second: + name: Jane + age: 25 + score: 
88.0 +count: 2 +""" + self.assertEqual(dict_data_yaml, expected_yaml) + + reconstructed_dict_data = DictData.from_dict(dict_data_dict) + self.assertEqual(reconstructed_dict_data, dict_data) + + reconstructed_dict_data_json = DictData.from_json(dict_data_json) + self.assertEqual(reconstructed_dict_data_json, dict_data) + + reconstructed_dict_data_yaml = DictData.from_yaml(dict_data_yaml) + self.assertEqual(reconstructed_dict_data_yaml, dict_data) + + def test_ordered_dict_data(self): + simple1 = SimpleData(name="John", age=30, score=95.5) + simple2 = SimpleData(name="Jane", age=25, score=88.0) + ordered_dict_data = OrderedDictData( + ordered_mappings=OrderedDict([("first", simple1), ("second", simple2)]), + count=2, + ) + ordered_dict_data_dict = ordered_dict_data.to_dict() + expected_dict = { + "ordered_mappings": OrderedDict( + [ + ("first", {"name": "John", "age": 30, "score": 95.5}), + ("second", {"name": "Jane", "age": 25, "score": 88.0}), + ] + ), + "count": 2, + } + self.assertEqual(ordered_dict_data_dict, expected_dict) + + ordered_dict_data_json = ordered_dict_data.to_json() + expected_json = """{ + "ordered_mappings": { + "first": { + "name": "John", + "age": 30, + "score": 95.5 + }, + "second": { + "name": "Jane", + "age": 25, + "score": 88.0 + } + }, + "count": 2 +}""" + self.assertEqual(ordered_dict_data_json, expected_json) + + ordered_dict_data_yaml = ordered_dict_data.to_yaml() + expected_yaml = """ordered_mappings: + first: + name: John + age: 30 + score: 95.5 + second: + name: Jane + age: 25 + score: 88.0 +count: 2 +""" + print(ordered_dict_data_yaml) + self.assertEqual(ordered_dict_data_yaml, expected_yaml) + + reconstructed_ordered_dict_data = OrderedDictData.from_dict( + ordered_dict_data_dict + ) + self.assertEqual(reconstructed_ordered_dict_data, ordered_dict_data) + + reconstructed_ordered_dict_data_json = OrderedDictData.from_json( + ordered_dict_data_json + ) + self.assertEqual(reconstructed_ordered_dict_data_json, ordered_dict_data) + + reconstructed_ordered_dict_data_yaml = OrderedDictData.from_yaml( + ordered_dict_data_yaml + ) + self.assertEqual(reconstructed_ordered_dict_data_yaml, ordered_dict_data) + + def test_complex_data(self): + simple1 = SimpleData(name="John", age=30, score=95.5) + simple2 = SimpleData(name="Jane", age=25, score=88.0) + nested = NestedData(simple=simple1, description="Test description") + list_data = ListData(items=[simple1, simple2], total=2) + dict_data = DictData(mappings={"first": simple1, "second": simple2}, count=2) + ordered_dict_data = OrderedDictData( + ordered_mappings=OrderedDict([("first", simple1), ("second", simple2)]), + count=2, + ) + complex_data = ComplexData2( + field1="field1_value", + field2=123, + field3=456.78, + nested=nested, + list_data=list_data, + dict_data=dict_data, + ordered_dict_data=ordered_dict_data, + ) + complex_data_dict = complex_data.to_dict() + expected_dict = { + "field1": "field1_value", + "field2": 123, + "field3": 456.78, + "nested": { + "simple": {"name": "John", "age": 30, "score": 95.5}, + "description": "Test description", + }, + "list_data": { + "items": [ + {"name": "John", "age": 30, "score": 95.5}, + {"name": "Jane", "age": 25, "score": 88.0}, + ], + "total": 2, + }, + "dict_data": { + "mappings": { + "first": {"name": "John", "age": 30, "score": 95.5}, + "second": {"name": "Jane", "age": 25, "score": 88.0}, + }, + "count": 2, + }, + "ordered_dict_data": { + "ordered_mappings": OrderedDict( + [ + ("first", {"name": "John", "age": 30, "score": 95.5}), + ("second", {"name": "Jane", 
"age": 25, "score": 88.0}), + ] + ), + "count": 2, + }, + } + self.assertEqual(complex_data_dict, expected_dict) + + complex_data_json = complex_data.to_json() + expected_json = """{ + "field1": "field1_value", + "field2": 123, + "field3": 456.78, + "nested": { + "simple": { + "name": "John", + "age": 30, + "score": 95.5 + }, + "description": "Test description" + }, + "list_data": { + "items": [ + { + "name": "John", + "age": 30, + "score": 95.5 + }, + { + "name": "Jane", + "age": 25, + "score": 88.0 + } + ], + "total": 2 + }, + "dict_data": { + "mappings": { + "first": { + "name": "John", + "age": 30, + "score": 95.5 + }, + "second": { + "name": "Jane", + "age": 25, + "score": 88.0 + } + }, + "count": 2 + }, + "ordered_dict_data": { + "ordered_mappings": { + "first": { + "name": "John", + "age": 30, + "score": 95.5 + }, + "second": { + "name": "Jane", + "age": 25, + "score": 88.0 + } + }, + "count": 2 + } +}""" + self.assertEqual(complex_data_json, expected_json) + + complex_data_yaml = complex_data.to_yaml() + expected_yaml = """field1: field1_value +field2: 123 +field3: 456.78 +nested: + simple: + name: John + age: 30 + score: 95.5 + description: Test description +list_data: + items: + - name: John + age: 30 + score: 95.5 + - name: Jane + age: 25 + score: 88.0 + total: 2 +dict_data: + mappings: + first: + name: John + age: 30 + score: 95.5 + second: + name: Jane + age: 25 + score: 88.0 + count: 2 +ordered_dict_data: + ordered_mappings: + first: + name: John + age: 30 + score: 95.5 + second: + name: Jane + age: 25 + score: 88.0 + count: 2 +""" + self.assertEqual(complex_data_yaml, expected_yaml) + + reconstructed_complex_data = ComplexData2.from_dict(complex_data_dict) + self.assertEqual(reconstructed_complex_data, complex_data) + + reconstructed_complex_data_json = ComplexData2.from_json(complex_data_json) + self.assertEqual(reconstructed_complex_data_json, complex_data) + + reconstructed_complex_data_yaml = ComplexData2.from_yaml(complex_data_yaml) + self.assertEqual(reconstructed_complex_data_yaml, complex_data) + + +if __name__ == "__main__": + unittest.main() diff --git a/lightrag/tests/test_function_expression_parse.py b/lightrag/tests/test_function_expression_parse.py new file mode 100644 index 00000000..8dcd74f9 --- /dev/null +++ b/lightrag/tests/test_function_expression_parse.py @@ -0,0 +1,183 @@ +import ast +import pytest + +from lightrag.core.functional import evaluate_ast_node, parse_function_call_expr + +from dataclasses import dataclass +import numpy as np + + +@dataclass +class Point: + x: int + y: int + + +def add_points(p1: Point, p2: Point) -> Point: + return Point(p1.x + p2.x, p1.y + p2.y) + + +def numpy_sum(arr: np.ndarray) -> int: + return np.sum(arr) + + +class TestAstEvaluation: + def setup_method(self): + def add(a, b: int) -> int: + return a + b + + self.context_map = { + "x": 10, + "y": 5, + "add": add, + "subtract": lambda a, b: a - b, + "multiply": lambda a, b: a * b, + "divide": lambda a, b: a / b, + "Point": Point, + "add_points": add_points, + "np": np, # Adding numpy to the context map + "array": np.array, + "sum": np.sum, + "mean": np.mean, + "numpy_sum": numpy_sum, + } + + def test_evaluate_constant(self): + node = ast.parse("42", mode="eval").body + assert evaluate_ast_node(node) == 42 + + def test_evaluate_dict(self): + node = ast.parse("{'a': 1, 'b': 2}", mode="eval").body + assert evaluate_ast_node(node) == {"a": 1, "b": 2} + + def test_evaluate_list(self): + node = ast.parse("[1, 2, 3]", mode="eval").body + assert evaluate_ast_node(node) == [1, 2, 
3] + + def test_evaluate_tuple(self): + node = ast.parse("(1, 2, 3)", mode="eval").body + assert evaluate_ast_node(node) == (1, 2, 3) + + def test_evaluate_unary_op(self): + node = ast.parse("-10", mode="eval").body + assert evaluate_ast_node(node) == -10 + + def test_evaluate_bin_op(self): + node = ast.parse("10 + 5", mode="eval").body + assert evaluate_ast_node(node) == 15 + + def test_evaluate_name(self): + node = ast.parse("x", mode="eval").body + assert evaluate_ast_node(node, self.context_map) == 10 + + def test_evaluate_function_call(self): + node = ast.parse("add(3, 4)", mode="eval").body + assert evaluate_ast_node(node, self.context_map) == 7 + + def test_unsupported_ast_node(self): + node = ast.parse("lambda x: x + 1", mode="eval").body + with pytest.raises(ValueError): + evaluate_ast_node(node) + + def test_parse_function_call_expr_valid(self): + func_expr = "add(3, 4)" + func_name, args, kwargs = parse_function_call_expr(func_expr, self.context_map) + assert func_name == "add" + assert args == [3, 4] + assert kwargs == {} + + def test_parse_function_call_expr_with_kwargs(self): + self.context_map["power"] = lambda x, y=2: x**y + func_expr = "power(3, y=3)" + func_name, args, kwargs = parse_function_call_expr(func_expr, self.context_map) + assert func_name == "power" + assert args == [3] + assert kwargs == {"y": 3} + + def test_parse_function_call_expr_invalid(self): + func_expr = "3 + 4" + with pytest.raises(ValueError): + parse_function_call_expr(func_expr, self.context_map) + + def test_evaluate_nested_function_calls(self): + node = ast.parse("add(multiply(2, 3), 4)", mode="eval").body + assert evaluate_ast_node(node, self.context_map) == 10 + + def test_evaluate_with_variable_replacement(self): + func_expr = "add(x, y)" + func_name, args, kwargs = parse_function_call_expr(func_expr, self.context_map) + assert func_name == "add" + assert args == [10, 5] + assert kwargs == {} + assert self.context_map["add"](*args, **kwargs) == 15 + + def test_evaluate_with_wrong_keyword(self): + func_expr = "add(x, y=5)" + func_name, args, kwargs = parse_function_call_expr(func_expr, self.context_map) + assert func_name == "add" + assert args == [10] + assert kwargs == {"y": 5} + with pytest.raises(TypeError): + self.context_map["add"](*args, **kwargs) + + def test_evaluate_with_variable_replacement_and_kwargs(self): + func_expr = "add(x, b=y)" + func_name, args, kwargs = parse_function_call_expr(func_expr, self.context_map) + assert func_name == "add" + assert args == [10] + assert kwargs == {"b": 5} + assert self.context_map["add"](*args, **kwargs) == 15 + + def test_evaluate_with_unknown_variable(self): + func_expr = "add(x, z)" + with pytest.raises(ValueError): + parse_function_call_expr(func_expr, self.context_map) + + def test_evaluate_complex_list(self): + node = ast.parse("[1, [2, 3], {'a': 4, 'b': [5, 6]}]", mode="eval").body + result = evaluate_ast_node(node) + assert result == [1, [2, 3], {"a": 4, "b": [5, 6]}] + + def test_evaluate_complex_dict(self): + node = ast.parse( + "{'key1': 1, 'key2': {'subkey1': [1, 2, 3], 'subkey2': {'subsubkey': 4}}}", + mode="eval", + ).body + result = evaluate_ast_node(node) + assert result == { + "key1": 1, + "key2": {"subkey1": [1, 2, 3], "subkey2": {"subsubkey": 4}}, + } + + def test_evaluate_with_dataclass(self): + func_expr = "add_points(Point(1, 2), Point(3, 4))" + func_name, args, kwargs = parse_function_call_expr(func_expr, self.context_map) + assert func_name == "add_points" + assert args == [Point(1, 2), Point(3, 4)] + assert kwargs == {} + 
result = self.context_map[func_name](*args, **kwargs) + assert result == Point(4, 6) + + def test_evaluate_numpy_array(self): + node = ast.parse("array([1, 2, 3, 4])", mode="eval").body + result = evaluate_ast_node(node, self.context_map) + np.testing.assert_array_equal(result, np.array([1, 2, 3, 4])) + + def test_evaluate_numpy_sum(self): + node = ast.parse("sum(array([1, 2, 3, 4]))", mode="eval").body + result = evaluate_ast_node(node, self.context_map) + assert result == 10 + + def test_evaluate_numpy_mean(self): + node = ast.parse("mean(array([1, 2, 3, 4]))", mode="eval").body + result = evaluate_ast_node(node, self.context_map) + assert result == 2.5 + + def test_evaluate_numpy_sum_2d(self): + node = ast.parse("numpy_sum(arr=np.array([[1, 2], [3, 4]]))", mode="eval").body + result = evaluate_ast_node(node, self.context_map) + assert result == 10 + + +if __name__ == "__main__": + pytest.main() diff --git a/lightrag/tests/test_generator.py b/lightrag/tests/test_generator.py index ac4c0de6..953e73d2 100644 --- a/lightrag/tests/test_generator.py +++ b/lightrag/tests/test_generator.py @@ -1,4 +1,3 @@ -import pytest from unittest import IsolatedAsyncioTestCase from unittest.mock import patch, Mock import os @@ -7,15 +6,17 @@ from lightrag.core.types import GeneratorOutput from lightrag.core.generator import Generator + from lightrag.core.model_client import ModelClient from lightrag.tracing import GeneratorStateLogger -import lightrag.utils.setup_env class TestGenerator(IsolatedAsyncioTestCase): def setUp(self): # Assuming that OpenAIClient is correctly mocked and passed to Generator - with patch("core.model_client.ModelClient", spec=ModelClient) as MockAPI: + with patch( + "lightrag.core.model_client.ModelClient", spec=ModelClient + ) as MockAPI: mock_api_client = Mock(ModelClient) MockAPI.return_value = mock_api_client mock_api_client.call.return_value = "Generated text response" @@ -75,7 +76,7 @@ def test_generator_prompt_update(self): # Update the prompt variable and value preset_prompt_kwargs = {"input_str": "Hello, updated world!"} generator = Generator( - model_client=self.mock_api_client, preset_prompt_kwargs=preset_prompt_kwargs + model_client=self.mock_api_client, prompt_kwargs=preset_prompt_kwargs ) prompt_logger.log_prompt(generator=generator, name="Test Generator") @@ -84,7 +85,7 @@ def test_generator_prompt_update(self): ) self.assertEqual( prompt_logger._trace_map["Test Generator"][1].prompt_states["data"][ - "preset_prompt_kwargs" + "prompt_kwargs" ]["input_str"], "Hello, updated world!", ) diff --git a/lightrag/tests/test_gt_text_splitter.py b/lightrag/tests/test_gt_text_splitter.py deleted file mode 100644 index 8c3aa4d2..00000000 --- a/lightrag/tests/test_gt_text_splitter.py +++ /dev/null @@ -1,143 +0,0 @@ -import unittest -from lightrag.core.types import Document -from lightrag.components.data_process.text_splitter import TextSplitter # Import your TextSplitter -from lightrag.components.data_process.document_splitter import DocumentSplitter # Import the ground truth splitter - - -class TestTextSplitterComparison(unittest.TestCase): - - def setUp(self): - self.text_splitter = TextSplitter(split_by="word", chunk_size=5, chunk_overlap=2) - self.ground_truth_splitter = DocumentSplitter(split_by="word", split_length=5, split_overlap=2) - - def compare_splits(self, text): - expected = self.ground_truth_splitter.split_text(text) - result = self.text_splitter.split_text(text) - - print(f"expected: {expected}") - print(f"result: {result}") - self.assertEqual(result, expected) - - 
def test_exact_chunk_size(self): - text = "one two three four five" - self.compare_splits(text) - - def test_less_than_chunk_size(self): - text = "one two" - self.compare_splits(text) - - def test_single_word(self): - text = "word" - self.compare_splits(text) - - def test_overlap_handling(self): - text = "one two three four five six seven" - self.compare_splits(text) - - def test_multiple_chunks_with_overlap(self): - text = "one two three four five six seven eight nine ten eleven" - self.compare_splits(text) - - def test_end_index_matches_length(self): - text = "one two three four five six" - self.compare_splits(text) - - def test_long_text(self): - text = " ".join(["word"] * 50) - self.compare_splits(text) - - def test_split_by_sentence(self): - self.text_splitter = TextSplitter(split_by="sentence", chunk_size=1, chunk_overlap=0) - self.ground_truth_splitter = DocumentSplitter(split_by="sentence", split_length=1, split_overlap=0) - text = "This is a test. It should work well." - - self.compare_splits(text) - - def test_split_by_page(self): - self.text_splitter = TextSplitter(split_by="page", chunk_size=1, chunk_overlap=0) - self.ground_truth_splitter = DocumentSplitter(split_by="page", split_length=1, split_overlap=0) - text = "This is a test\fThis is another page" - self.compare_splits(text) - - def test_split_by_passage(self): - self.text_splitter = TextSplitter(split_by="passage", chunk_size=1, chunk_overlap=0) - self.ground_truth_splitter = DocumentSplitter(split_by="passage", split_length=1, split_overlap=0) - text = "This is a test\n\nThis is another passage" - self.compare_splits(text) - - def test_empty_text(self): - text = "" - self.compare_splits(text) - - def test_special_characters(self): - text = "one! two@ three# four$ five% six^ seven& eight* nine( ten)" - self.compare_splits(text) - - def test_multiple_spaces(self): - text = "one two three four five" - self.compare_splits(text) - - def test_newlines(self): - text = "one\ntwo\nthree\nfour\nfive\nsix" - self.compare_splits(text) - - def test_tabs(self): - text = "one\ttwo\tthree\tfour\tfive\tsix" - self.compare_splits(text) - - def test_varied_delimiters(self): - text = "one. two, three; four: five! six? seven" - self.compare_splits(text) - - def test_text_with_punctuation(self): - text = "Hello, world! This is a test. Let's see how it works." - self.compare_splits(text) - - def test_long_paragraph(self): - text = ( - "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. " - "Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. " - "Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. " - "Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum." 
- ) - self.compare_splits(text) - - def test_trailing_whitespace(self): - text = "one two three four five " - self.compare_splits(text) - - def test_leading_whitespace(self): - text = " one two three four five" - self.compare_splits(text) - - def test_mixed_whitespace(self): - text = " one two three four five " - self.compare_splits(text) - - def test_chunk_size_greater_than_overlap(self): - self.text_splitter = TextSplitter(split_by="word", chunk_size=4, chunk_overlap=2) - self.ground_truth_splitter = DocumentSplitter(split_by="word", split_length=4, split_overlap=2) - text = "one two three four five six seven eight nine ten" - self.compare_splits(text) - - def test_overlap_zero(self): - self.text_splitter = TextSplitter(split_by="word", chunk_size=4, chunk_overlap=0) - self.ground_truth_splitter = DocumentSplitter(split_by="word", split_length=4, split_overlap=0) - text = "one two three four five six seven eight nine ten" - self.compare_splits(text) - - def test_overlap_zero_end(self): - self.text_splitter = TextSplitter(split_by="word", chunk_size=5, chunk_overlap=0) - self.ground_truth_splitter = DocumentSplitter(split_by="word", split_length=5, split_overlap=0) - text = "one two three four five six seven eight nine ten" - self.compare_splits(text) - - def test_invalid_parameters(self): - with self.assertRaises(ValueError): - TextSplitter(split_by="word", chunk_size=-1, chunk_overlap=2) - with self.assertRaises(ValueError): - TextSplitter(split_by="word", chunk_size=5, chunk_overlap=6) - - -if __name__ == '__main__': - unittest.main() diff --git a/lightrag/tests/test_string_parser.py b/lightrag/tests/test_string_parser.py index b05c46de..ab07fdf3 100644 --- a/lightrag/tests/test_string_parser.py +++ b/lightrag/tests/test_string_parser.py @@ -1,3 +1,5 @@ +import pytest + from lightrag.core.string_parser import ( JsonParser, ) @@ -8,8 +10,6 @@ fix_json_escaped_single_quotes, ) -import pytest - ################################################## # Test cases for extract_json_str function @@ -33,6 +33,19 @@ def test_extract_json_str_no_json(): extract_json_str(text) +def test_extract_list_json_str(): + text = '["item1", "item2"]' + assert extract_json_str(text) == '["item1", "item2"]', "Failed to extract list" + + +def test_extract_json_arr_str(): + text = '```json\n[\n {\n "action": "add(2, b=3)"\n },\n {\n "action": "search(query=\'something\')"\n }\n]\n```' + assert ( + extract_json_str(text) + == '[\n {\n "action": "add(2, b=3)"\n },\n {\n "action": "search(query=\'something\')"\n }\n]' + ) + + ################################################## # Test cases for fix_json_formatting function ################################################## @@ -114,6 +127,23 @@ def test_json_parser_fix_missing_commas(): } +def test_json_parser_numpy_array(): + text = """```json +{ + "name": "numpy_sum", + "kwargs": { + "arr": [[1, 2], [3, 4]] + } +} +```""" + parser = JsonParser() + result = parser(text) + assert result == { + "name": "numpy_sum", + "kwargs": {"arr": [[1, 2], [3, 4]]}, + } + + def test_json_parser_handling_decode_error(): parser = JsonParser() # Deliberately malformed JSON that is also problematic for YAML diff --git a/lightrag/tests/test_text_splitter.py b/lightrag/tests/test_text_splitter.py new file mode 100644 index 00000000..eb03a3de --- /dev/null +++ b/lightrag/tests/test_text_splitter.py @@ -0,0 +1,64 @@ +import unittest +from lightrag.core.types import Document +from lightrag.components.data_process.text_splitter import TextSplitter + + +class TestTextSplitter(unittest.TestCase): 
+ + def setUp(self): + # Set up a TextSplitter instance before each test + self.splitter = TextSplitter(split_by="word", chunk_size=5, chunk_overlap=2) + + # def test_invalid_split_by(self): + # # Test initialization with invalid split_by value + # with self.assertRaises(ValueError): + # TextSplitter(split_by="invalid", chunk_size=5, chunk_overlap=0) + + # def test_negative_chunk_size(self): + # # Test initialization with negative chunk_size + # with self.assertRaises(ValueError): + # TextSplitter(split_by="word", chunk_size=-1, chunk_overlap=0) + + # def test_negative_chunk_overlap(self): + # # Test initialization with negative chunk_overlap + # with self.assertRaises(ValueError): + # TextSplitter(split_by="word", chunk_size=5, chunk_overlap=-1) + + def test_split_by_word(self): + # Test the basic functionality of splitting by word + text = "This is a simple test" + expected = ["This is a simple test"] + result = self.splitter.split_text(text) + self.assertEqual(result, expected) + + def test_split_by_sentence(self): + # Test splitting by sentence + splitter = TextSplitter(split_by="sentence", chunk_size=1, chunk_overlap=0) + text = "This is a test. It should work well." + expected = ["This is a test.", " It should work well."] + result = splitter.split_text(text) + self.assertEqual(result, expected) + + def test_overlap_handling(self): + # Test proper handling of overlap + text = "one two three four five six seven" + expected = ["one two three four five ", "four five six seven"] + result = self.splitter.split_text(text) + self.assertEqual(result, expected) + + def test_document_splitting(self): + # Test splitting a list of documents + docs = [Document(text="This is a simple test to check splitting.", id="1")] + expected_texts = ["This is a simple test ", "simple test to check splitting."] + result = self.splitter.call(docs) + result_texts = [doc.text for doc in result] + self.assertEqual(result_texts, expected_texts) + + # def test_empty_text_handling(self): + # # Test handling of empty text + # with self.assertRaises(ValueError): + # self.splitter.call([Document(text=None, id="1")]) + + +if __name__ == "__main__": + unittest.main() diff --git a/lightrag/tests/test_tool.py b/lightrag/tests/test_tool.py index 00524cf5..818c87f5 100644 --- a/lightrag/tests/test_tool.py +++ b/lightrag/tests/test_tool.py @@ -1,7 +1,8 @@ import pytest from dataclasses import dataclass -from lightrag.core.tool_helper import FunctionTool, ToolMetadata, ToolOutput +from lightrag.core.func_tool import FunctionTool +from lightrag.core.types import FunctionDefinition @dataclass @@ -19,59 +20,60 @@ async def async_add(x, y): return x + y -metadata = ToolMetadata(description="A simple addition tool") +metadata = FunctionDefinition(func_desc="A simple addition tool", func_name="add") def test_function_tool_sync(): - tool = FunctionTool(metadata=metadata, fn=sync_add) + tool = FunctionTool(definition=metadata, fn=sync_add) print( - f"tool: {tool}, tool.metadata: {tool.metadata}, tool.fn: {tool.fn}, tool.async_fn: {tool.async_fn}" + f"tool: {tool}, tool.metadata: {tool.definition}, tool.fn: {tool.fn}, tool.async: {tool._is_async}" ) output = tool(1, 2) # Using __call__ which proxies to call() - assert output.raw_output == 3 - assert output.name is None # Since name is optional and not set - assert "args" in output.raw_input - assert output.raw_input["args"] == (1, 2) + assert output.output == 3 + assert output.name == "add", "The name should be set to the function name" + assert hasattr(output.input, "args") + assert 
output.input.args == (1, 2) def test_function_tool_async(): - tool = FunctionTool(metadata=metadata, async_fn=async_add) + # use default metadata + tool = FunctionTool(fn=async_add) import asyncio output = asyncio.run(tool.acall(3, 4)) - assert output.raw_output == 7 - assert output.name is None # Since name is optional and not set - assert "args" in output.raw_input - assert output.raw_input["args"] == (3, 4) + assert output.output == 7 + assert output.name == "async_add", "The name should be set to the function name" + assert hasattr(output.input, "args") + assert output.input.args == (3, 4) - -def test_invalid_function_tool_initialization(): - # Test initialization without any function should raise ValueError + # call with sync call with raise ValueError with pytest.raises(ValueError): - tool = FunctionTool(metadata=metadata) + tool.call(3, 4) + +# def test_invalid_function_tool_initialization(): +# # Test initialization without any function should raise ValueError +# with pytest.raises(ValueError): +# tool = FunctionTool(metadata=metadata) -def test_tool_output_str_content(): - output = ToolOutput(raw_input={}, raw_output=100) - assert str(output) == "100" +# def test_from_defaults_uses_function_docstring(): +# def sample_function(x, y, user: User = User(id=1, name="John")): +# """ +# Adds two numbers together and returns the sum. +# """ +# return x + y -def test_from_defaults_uses_function_docstring(): - def sample_function(x, y, user: User = User(id=1, name="John")): - """ - Adds two numbers together and returns the sum. - """ - return x + y +# tool = FunctionTool(fn=sample_function) - tool = FunctionTool.from_defaults(fn=sample_function) +# expected_description = sample_function.__doc__.strip() +# actual_description = tool.metadata.description +# print(f"Expected: {expected_description}, Actual: {actual_description}") - expected_description = sample_function.__doc__.strip() - actual_description = tool.metadata.description - print(f"Expected: {expected_description}, Actual: {actual_description}") - # # Check if the metadata description matches the function's docstring - # assert ( - # actual_description == expected_description - # ), f"The description should automatically be set to the function's docstring. Expected: {expected_description}, Actual: {actual_description}" +# # Check if the metadata description matches the function's docstring +# assert ( +# actual_description == expected_description +# ), f"The description should automatically be set to the function's docstring. 
Expected: {expected_description}, Actual: {actual_description}" diff --git a/lightrag/tests/test_transformer_client.py b/lightrag/tests/test_transformer_client.py index 33e498d4..78c5bdf2 100644 --- a/lightrag/tests/test_transformer_client.py +++ b/lightrag/tests/test_transformer_client.py @@ -1,12 +1,6 @@ import unittest import torch -from lightrag.components.model_client import ( - TransformersClient, - TransformerReranker, - TransformerEmbedder, -) -from lightrag.core.types import ModelType # Set the number of threads for PyTorch, avoid segementation fault torch.set_num_threads(1) @@ -22,81 +16,104 @@ def setUp(self) -> None: "The red panda (Ailurus fulgens), also called the lesser panda, the red bear-cat, and the red cat-bear, is a mammal native to the eastern Himalayas and southwestern China.", ] - def test_transformer_embedder(self): - transformer_embedder_model = "thenlper/gte-base" - transformer_embedder_model_component = TransformerEmbedder( - model_name=transformer_embedder_model - ) - print( - f"Testing transformer embedder with model {transformer_embedder_model_component}" - ) - print("Testing transformer embedder") - output = transformer_embedder_model_component( - model=transformer_embedder_model, input="Hello world" - ) - print(output) - - def test_transformer_client(self): - transformer_client = TransformersClient() - print("Testing transformer client") - # run the model - kwargs = { - "model": "thenlper/gte-base", - # "mock": False, - } - api_kwargs = transformer_client.convert_inputs_to_api_kwargs( - input="Hello world", - model_kwargs=kwargs, - model_type=ModelType.EMBEDDER, - ) - # print(api_kwargs) - output = transformer_client.call( - api_kwargs=api_kwargs, model_type=ModelType.EMBEDDER - ) - - # print(transformer_client) - # print(output) - - def test_transformer_reranker(self): - transformer_reranker_model = "BAAI/bge-reranker-base" - transformer_reranker_model_component = TransformerReranker() - # print( - # f"Testing transformer reranker with model {transformer_reranker_model_component}" - # ) - - model_kwargs = { - "model": transformer_reranker_model, - "documents": self.documents, - "query": self.query, - "top_k": 2, - } - - output = transformer_reranker_model_component( - **model_kwargs, - ) - # assert output is a list of float with length 2 - self.assertEqual(len(output), 2) - self.assertEqual(type(output[0]), float) - - def test_transformer_reranker_client(self): - transformer_reranker_client = TransformersClient( - model_name="BAAI/bge-reranker-base" - ) - print("Testing transformer reranker client") - # run the model - kwargs = { - "model": "BAAI/bge-reranker-base", - "documents": self.documents, - "top_k": 2, - } - api_kwargs = transformer_reranker_client.convert_inputs_to_api_kwargs( - input=self.query, - model_kwargs=kwargs, - model_type=ModelType.RERANKER, - ) - print(api_kwargs) - self.assertEqual(api_kwargs["model"], "BAAI/bge-reranker-base") - output = transformer_reranker_client.call( - api_kwargs=api_kwargs, model_type=ModelType.RERANKER - ) - self.assertEqual(type(output), tuple) + # def test_transformer_embedder(self): + # transformer_embedder_model = "thenlper/gte-base" + # transformer_embedder_model_component = TransformerEmbedder( + # model_name=transformer_embedder_model + # ) + # print( + # f"Testing transformer embedder with model {transformer_embedder_model_component}" + # ) + # print("Testing transformer embedder") + # output = transformer_embedder_model_component( + # model=transformer_embedder_model, input="Hello world" + # ) + # 
print(output) + + # def test_transformer_client(self): + # transformer_client = TransformersClient() + # print("Testing transformer client") + # # run the model + # kwargs = { + # "model": "thenlper/gte-base", + # # "mock": False, + # } + # api_kwargs = transformer_client.convert_inputs_to_api_kwargs( + # input="Hello world", + # model_kwargs=kwargs, + # model_type=ModelType.EMBEDDER, + # ) + # # print(api_kwargs) + # output = transformer_client.call( + # api_kwargs=api_kwargs, model_type=ModelType.EMBEDDER + # ) + + # # print(transformer_client) + # # print(output) + + # def test_transformer_reranker(self): + # transformer_reranker_model = "BAAI/bge-reranker-base" + # transformer_reranker_model_component = TransformerReranker() + # # print( + # # f"Testing transformer reranker with model {transformer_reranker_model_component}" + # # ) + + # model_kwargs = { + # "model": transformer_reranker_model, + # "documents": self.documents, + # "query": self.query, + # "top_k": 2, + # } + + # output = transformer_reranker_model_component( + # **model_kwargs, + # ) + # # assert output is a list of float with length 2 + # self.assertEqual(len(output), 2) + # self.assertEqual(type(output[0]), float) + + # def test_transformer_reranker_client(self): + # transformer_reranker_client = TransformersClient( + # model_name="BAAI/bge-reranker-base" + # ) + # print("Testing transformer reranker client") + # # run the model + # kwargs = { + # "model": "BAAI/bge-reranker-base", + # "documents": self.documents, + # "top_k": 2, + # } + # api_kwargs = transformer_reranker_client.convert_inputs_to_api_kwargs( + # input=self.query, + # model_kwargs=kwargs, + # model_type=ModelType.RERANKER, + # ) + # print(api_kwargs) + # self.assertEqual(api_kwargs["model"], "BAAI/bge-reranker-base") + # output = transformer_reranker_client.call( + # api_kwargs=api_kwargs, model_type=ModelType.RERANKER + # ) + # self.assertEqual(type(output), tuple) + + # def test_transformer_llm_response(self): + # """Test the TransformerLLM model with zephyr-7b-beta for generating a response.""" + # transformer_llm_model = "HuggingFaceH4/zephyr-7b-beta" + # transformer_llm_model_component = TransformerLLM(model_name=transformer_llm_model) + + # # Define a sample input + # input_text = "Hello, what's the weather today?" 
+ + # # Test generating a response, providing the 'model' keyword + # # response = transformer_llm_model_component(input=input_text, model=transformer_llm_model) + # response = transformer_llm_model_component(input_text=input_text) + + # # Check if the response is valid + # self.assertIsInstance(response, str, "The response should be a string.") + # self.assertTrue(len(response) > 0, "The response should not be empty.") + + # # Optionally, print the response for visual verification during testing + # print(f"Generated response: {response}") + + +if __name__ == "__main__": + unittest.main() diff --git a/lightrag/utils/setup_env.py b/lightrag/utils/setup_env.py deleted file mode 100644 index 15fee232..00000000 --- a/lightrag/utils/setup_env.py +++ /dev/null @@ -1,3 +0,0 @@ -import dotenv - -dotenv.load_dotenv(dotenv_path=".env", override=True) diff --git a/poetry.lock b/poetry.lock index ff2a772b..e8bd21e3 100644 --- a/poetry.lock +++ b/poetry.lock @@ -494,17 +494,17 @@ css = ["tinycss2 (>=1.1.0,<1.3)"] [[package]] name = "boto3" -version = "1.34.135" +version = "1.34.136" description = "The AWS SDK for Python" optional = false python-versions = ">=3.8" files = [ - {file = "boto3-1.34.135-py3-none-any.whl", hash = "sha256:6f5d7a20afbe45e3f7c6b5e96071752d36c3942535b1f7924964f1fdf25376a7"}, - {file = "boto3-1.34.135.tar.gz", hash = "sha256:344f635233c85dbb509b87638232ff9132739f90bb5e6bf01fa0e0a521a9107e"}, + {file = "boto3-1.34.136-py3-none-any.whl", hash = "sha256:d41037e2c680ab8d6c61a0a4ee6bf1fdd9e857f43996672830a95d62d6f6fa79"}, + {file = "boto3-1.34.136.tar.gz", hash = "sha256:0314e6598f59ee0f34eb4e6d1a0f69fa65c146d2b88a6e837a527a9956ec2731"}, ] [package.dependencies] -botocore = ">=1.34.135,<1.35.0" +botocore = ">=1.34.136,<1.35.0" jmespath = ">=0.7.1,<2.0.0" s3transfer = ">=0.10.0,<0.11.0" @@ -513,13 +513,13 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] [[package]] name = "botocore" -version = "1.34.135" +version = "1.34.136" description = "Low-level, data-driven core of boto 3." 
optional = false python-versions = ">=3.8" files = [ - {file = "botocore-1.34.135-py3-none-any.whl", hash = "sha256:3aa9e85e7c479babefb5a590e844435449df418085f3c74d604277bc52dc3109"}, - {file = "botocore-1.34.135.tar.gz", hash = "sha256:2e72f37072f75cb1391fca9d7a4c32cecb52a3557d62431d0f59d5311dc7d0cf"}, + {file = "botocore-1.34.136-py3-none-any.whl", hash = "sha256:c63fe9032091fb9e9477706a3ebfa4d0c109b807907051d892ed574f9b573e61"}, + {file = "botocore-1.34.136.tar.gz", hash = "sha256:7f7135178692b39143c8f152a618d2a3b71065a317569a7102d2306d4946f42f"}, ] [package.dependencies] @@ -791,6 +791,84 @@ traitlets = ">=4" [package.extras] test = ["pytest"] +[[package]] +name = "contourpy" +version = "1.2.1" +description = "Python library for calculating contours of 2D quadrilateral grids" +optional = false +python-versions = ">=3.9" +files = [ + {file = "contourpy-1.2.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:bd7c23df857d488f418439686d3b10ae2fbf9bc256cd045b37a8c16575ea1040"}, + {file = "contourpy-1.2.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5b9eb0ca724a241683c9685a484da9d35c872fd42756574a7cfbf58af26677fd"}, + {file = "contourpy-1.2.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4c75507d0a55378240f781599c30e7776674dbaf883a46d1c90f37e563453480"}, + {file = "contourpy-1.2.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:11959f0ce4a6f7b76ec578576a0b61a28bdc0696194b6347ba3f1c53827178b9"}, + {file = "contourpy-1.2.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:eb3315a8a236ee19b6df481fc5f997436e8ade24a9f03dfdc6bd490fea20c6da"}, + {file = "contourpy-1.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:39f3ecaf76cd98e802f094e0d4fbc6dc9c45a8d0c4d185f0f6c2234e14e5f75b"}, + {file = "contourpy-1.2.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:94b34f32646ca0414237168d68a9157cb3889f06b096612afdd296003fdd32fd"}, + {file = "contourpy-1.2.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:457499c79fa84593f22454bbd27670227874cd2ff5d6c84e60575c8b50a69619"}, + {file = "contourpy-1.2.1-cp310-cp310-win32.whl", hash = "sha256:ac58bdee53cbeba2ecad824fa8159493f0bf3b8ea4e93feb06c9a465d6c87da8"}, + {file = "contourpy-1.2.1-cp310-cp310-win_amd64.whl", hash = "sha256:9cffe0f850e89d7c0012a1fb8730f75edd4320a0a731ed0c183904fe6ecfc3a9"}, + {file = "contourpy-1.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6022cecf8f44e36af10bd9118ca71f371078b4c168b6e0fab43d4a889985dbb5"}, + {file = "contourpy-1.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ef5adb9a3b1d0c645ff694f9bca7702ec2c70f4d734f9922ea34de02294fdf72"}, + {file = "contourpy-1.2.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6150ffa5c767bc6332df27157d95442c379b7dce3a38dff89c0f39b63275696f"}, + {file = "contourpy-1.2.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4c863140fafc615c14a4bf4efd0f4425c02230eb8ef02784c9a156461e62c965"}, + {file = "contourpy-1.2.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:00e5388f71c1a0610e6fe56b5c44ab7ba14165cdd6d695429c5cd94021e390b2"}, + {file = "contourpy-1.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d4492d82b3bc7fbb7e3610747b159869468079fe149ec5c4d771fa1f614a14df"}, + {file = "contourpy-1.2.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:49e70d111fee47284d9dd867c9bb9a7058a3c617274900780c43e38d90fe1205"}, + {file = 
"contourpy-1.2.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:b59c0ffceff8d4d3996a45f2bb6f4c207f94684a96bf3d9728dbb77428dd8cb8"}, + {file = "contourpy-1.2.1-cp311-cp311-win32.whl", hash = "sha256:7b4182299f251060996af5249c286bae9361fa8c6a9cda5efc29fe8bfd6062ec"}, + {file = "contourpy-1.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:2855c8b0b55958265e8b5888d6a615ba02883b225f2227461aa9127c578a4922"}, + {file = "contourpy-1.2.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:62828cada4a2b850dbef89c81f5a33741898b305db244904de418cc957ff05dc"}, + {file = "contourpy-1.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:309be79c0a354afff9ff7da4aaed7c3257e77edf6c1b448a779329431ee79d7e"}, + {file = "contourpy-1.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2e785e0f2ef0d567099b9ff92cbfb958d71c2d5b9259981cd9bee81bd194c9a4"}, + {file = "contourpy-1.2.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1cac0a8f71a041aa587410424ad46dfa6a11f6149ceb219ce7dd48f6b02b87a7"}, + {file = "contourpy-1.2.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:af3f4485884750dddd9c25cb7e3915d83c2db92488b38ccb77dd594eac84c4a0"}, + {file = "contourpy-1.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9ce6889abac9a42afd07a562c2d6d4b2b7134f83f18571d859b25624a331c90b"}, + {file = "contourpy-1.2.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:a1eea9aecf761c661d096d39ed9026574de8adb2ae1c5bd7b33558af884fb2ce"}, + {file = "contourpy-1.2.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:187fa1d4c6acc06adb0fae5544c59898ad781409e61a926ac7e84b8f276dcef4"}, + {file = "contourpy-1.2.1-cp312-cp312-win32.whl", hash = "sha256:c2528d60e398c7c4c799d56f907664673a807635b857df18f7ae64d3e6ce2d9f"}, + {file = "contourpy-1.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:1a07fc092a4088ee952ddae19a2b2a85757b923217b7eed584fdf25f53a6e7ce"}, + {file = "contourpy-1.2.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bb6834cbd983b19f06908b45bfc2dad6ac9479ae04abe923a275b5f48f1a186b"}, + {file = "contourpy-1.2.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1d59e739ab0e3520e62a26c60707cc3ab0365d2f8fecea74bfe4de72dc56388f"}, + {file = "contourpy-1.2.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bd3db01f59fdcbce5b22afad19e390260d6d0222f35a1023d9adc5690a889364"}, + {file = "contourpy-1.2.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a12a813949e5066148712a0626895c26b2578874e4cc63160bb007e6df3436fe"}, + {file = "contourpy-1.2.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fe0ccca550bb8e5abc22f530ec0466136379c01321fd94f30a22231e8a48d985"}, + {file = "contourpy-1.2.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e1d59258c3c67c865435d8fbeb35f8c59b8bef3d6f46c1f29f6123556af28445"}, + {file = "contourpy-1.2.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:f32c38afb74bd98ce26de7cc74a67b40afb7b05aae7b42924ea990d51e4dac02"}, + {file = "contourpy-1.2.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:d31a63bc6e6d87f77d71e1abbd7387ab817a66733734883d1fc0021ed9bfa083"}, + {file = "contourpy-1.2.1-cp39-cp39-win32.whl", hash = "sha256:ddcb8581510311e13421b1f544403c16e901c4e8f09083c881fab2be80ee31ba"}, + {file = "contourpy-1.2.1-cp39-cp39-win_amd64.whl", hash = "sha256:10a37ae557aabf2509c79715cd20b62e4c7c28b8cd62dd7d99e5ed3ce28c3fd9"}, + {file = 
"contourpy-1.2.1-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:a31f94983fecbac95e58388210427d68cd30fe8a36927980fab9c20062645609"}, + {file = "contourpy-1.2.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ef2b055471c0eb466033760a521efb9d8a32b99ab907fc8358481a1dd29e3bd3"}, + {file = "contourpy-1.2.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:b33d2bc4f69caedcd0a275329eb2198f560b325605810895627be5d4b876bf7f"}, + {file = "contourpy-1.2.1.tar.gz", hash = "sha256:4d8908b3bee1c889e547867ca4cdc54e5ab6be6d3e078556814a22457f49423c"}, +] + +[package.dependencies] +numpy = ">=1.20" + +[package.extras] +bokeh = ["bokeh", "selenium"] +docs = ["furo", "sphinx (>=7.2)", "sphinx-copybutton"] +mypy = ["contourpy[bokeh,docs]", "docutils-stubs", "mypy (==1.8.0)", "types-Pillow"] +test = ["Pillow", "contourpy[test-no-images]", "matplotlib"] +test-no-images = ["pytest", "pytest-cov", "pytest-xdist", "wurlitzer"] + +[[package]] +name = "cycler" +version = "0.12.1" +description = "Composable style cycles" +optional = false +python-versions = ">=3.8" +files = [ + {file = "cycler-0.12.1-py3-none-any.whl", hash = "sha256:85cef7cff222d8644161529808465972e51340599459b8ac3ccbac5a854e0d30"}, + {file = "cycler-0.12.1.tar.gz", hash = "sha256:88bb128f02ba341da8ef447245a9e138fae777f6a23943da4540077d3601eb1c"}, +] + +[package.extras] +docs = ["ipython", "matplotlib", "numpydoc", "sphinx"] +tests = ["pytest", "pytest-cov", "pytest-xdist"] + [[package]] name = "dataclasses-json" version = "0.6.7" @@ -1015,45 +1093,6 @@ files = [ [package.extras] tests = ["asttokens (>=2.1.0)", "coverage", "coverage-enable-subprocess", "ipython", "littleutils", "pytest", "rich"] -[[package]] -name = "faiss-cpu" -version = "1.8.0.post1" -description = "A library for efficient similarity search and clustering of dense vectors." 
-optional = false -python-versions = ">=3.8" -files = [ - {file = "faiss_cpu-1.8.0.post1-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:fd84721eb599aa1da19b1b36345bb8705a60bb1d2887bbbc395a29e3d36a1a62"}, - {file = "faiss_cpu-1.8.0.post1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b78ff9079d15fd0f156bf5dd8a2975a8abffac1854a86ece263eec1500a2e836"}, - {file = "faiss_cpu-1.8.0.post1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9de25c943d1789e35fe06a20884c88cd32aedbb1a33bb8da2238cdea7bd9633f"}, - {file = "faiss_cpu-1.8.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:adae0f1b144e7216da696f14bc4991ca4300c94baaa59247c3d322588e661c95"}, - {file = "faiss_cpu-1.8.0.post1-cp310-cp310-win_amd64.whl", hash = "sha256:00345290680a444a4b4cb2d98a3844bb5c401a2160fee547c7631d759fd2ec3e"}, - {file = "faiss_cpu-1.8.0.post1-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:8d4bade10cb63e9f9ff261751edd7eb097b1f4bf30be4d0d25d6f688559d795e"}, - {file = "faiss_cpu-1.8.0.post1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:20bd43eca3b7d77e71ea56b7a558cc28e900d8abff417eb285e2d92e95d934d4"}, - {file = "faiss_cpu-1.8.0.post1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8542a87743a7f94ac656fd3e9592ad57e58b04d961ad2fe654a22a8ca59defdb"}, - {file = "faiss_cpu-1.8.0.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ed46928de3dc20170b10fec89c54075a11383c2aaf4f119c63e0f6ae5a507d74"}, - {file = "faiss_cpu-1.8.0.post1-cp311-cp311-win_amd64.whl", hash = "sha256:4fa5fc8ea210b919aa469e27d6687e50052db906e7fec3f2257178b1384fa18b"}, - {file = "faiss_cpu-1.8.0.post1-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:96aec0d08a3099883af3a9b6356cfe736e8bd879318a940a27e9d1ae6f33d788"}, - {file = "faiss_cpu-1.8.0.post1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:92b06147fa84732ecdc965922e8ef50dc7011ef8be65821ff4abb2118cb5dce0"}, - {file = "faiss_cpu-1.8.0.post1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:709ef9394d1148aef70dbe890edbde8c282a4a2e06a8b69ab64f65e90f5ba572"}, - {file = "faiss_cpu-1.8.0.post1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:327a9c30971bf72cd8392b15eb4aff5d898c453212eae656dfaa3ba555b9ca0c"}, - {file = "faiss_cpu-1.8.0.post1-cp312-cp312-win_amd64.whl", hash = "sha256:8756f1d93faba56349883fa2f5d47fe36bb2f11f789200c6b1c691ef805485f2"}, - {file = "faiss_cpu-1.8.0.post1-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:f4a3045909c447bf1955b70083891e80f2c87c5427f20cae25245e08ec5c9e52"}, - {file = "faiss_cpu-1.8.0.post1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:8842b7fc921ca1fafdb0845f2ba029e79df04eebae72ab135239f93478a9b7a2"}, - {file = "faiss_cpu-1.8.0.post1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9d5a9799634e32c3862d5436d1e78112ed9a38f319e4523f5916e55d86adda8f"}, - {file = "faiss_cpu-1.8.0.post1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2a70923b0fbbb40f647e20bcbcbfd472277e6d84bb23ff12d2a94b6841806b55"}, - {file = "faiss_cpu-1.8.0.post1-cp38-cp38-win_amd64.whl", hash = "sha256:ce652df3c4dd50c88ac9235d072f30ce60694dc422c5f523bbbcab320e8f3097"}, - {file = "faiss_cpu-1.8.0.post1-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:83ef04b17b19189dd6601a941bdf4bfa9de0740dbcd80305aeba51a1b1955f80"}, - {file = "faiss_cpu-1.8.0.post1-cp39-cp39-macosx_11_0_arm64.whl", hash = 
"sha256:c50c8697077470ede7f1939ef8dc8a846ec19cf1893b543f6b67f9af03b0a122"}, - {file = "faiss_cpu-1.8.0.post1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:98ce428a7a67fe5c64047280e5e12a8dbdecf7002f9d127b26cf1db354e9fe76"}, - {file = "faiss_cpu-1.8.0.post1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5f3b36b80380bae523e3198cfb4a137867055945ce7bf10d18fe9f0284f2fb47"}, - {file = "faiss_cpu-1.8.0.post1-cp39-cp39-win_amd64.whl", hash = "sha256:4fcc67a2353f08a20c1ab955de3cde14ef3b447761b26244a5aa849c15cbc9b3"}, - {file = "faiss_cpu-1.8.0.post1.tar.gz", hash = "sha256:5686af34414678c3d49c4fa8d774df7156e9cb48d7029071e56230e74b01cc13"}, -] - -[package.dependencies] -numpy = ">=1.0,<2.0" -packaging = "*" - [[package]] name = "fastavro" version = "1.9.4" @@ -1147,6 +1186,71 @@ sentence_transformers = "*" torch = ">=1.6.0" transformers = ">=4.33.0" +[[package]] +name = "fonttools" +version = "4.53.0" +description = "Tools to manipulate font files" +optional = false +python-versions = ">=3.8" +files = [ + {file = "fonttools-4.53.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:52a6e0a7a0bf611c19bc8ec8f7592bdae79c8296c70eb05917fd831354699b20"}, + {file = "fonttools-4.53.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:099634631b9dd271d4a835d2b2a9e042ccc94ecdf7e2dd9f7f34f7daf333358d"}, + {file = "fonttools-4.53.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e40013572bfb843d6794a3ce076c29ef4efd15937ab833f520117f8eccc84fd6"}, + {file = "fonttools-4.53.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:715b41c3e231f7334cbe79dfc698213dcb7211520ec7a3bc2ba20c8515e8a3b5"}, + {file = "fonttools-4.53.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:74ae2441731a05b44d5988d3ac2cf784d3ee0a535dbed257cbfff4be8bb49eb9"}, + {file = "fonttools-4.53.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:95db0c6581a54b47c30860d013977b8a14febc206c8b5ff562f9fe32738a8aca"}, + {file = "fonttools-4.53.0-cp310-cp310-win32.whl", hash = "sha256:9cd7a6beec6495d1dffb1033d50a3f82dfece23e9eb3c20cd3c2444d27514068"}, + {file = "fonttools-4.53.0-cp310-cp310-win_amd64.whl", hash = "sha256:daaef7390e632283051e3cf3e16aff2b68b247e99aea916f64e578c0449c9c68"}, + {file = "fonttools-4.53.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:a209d2e624ba492df4f3bfad5996d1f76f03069c6133c60cd04f9a9e715595ec"}, + {file = "fonttools-4.53.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4f520d9ac5b938e6494f58a25c77564beca7d0199ecf726e1bd3d56872c59749"}, + {file = "fonttools-4.53.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eceef49f457253000e6a2d0f7bd08ff4e9fe96ec4ffce2dbcb32e34d9c1b8161"}, + {file = "fonttools-4.53.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fa1f3e34373aa16045484b4d9d352d4c6b5f9f77ac77a178252ccbc851e8b2ee"}, + {file = "fonttools-4.53.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:28d072169fe8275fb1a0d35e3233f6df36a7e8474e56cb790a7258ad822b6fd6"}, + {file = "fonttools-4.53.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:4a2a6ba400d386e904fd05db81f73bee0008af37799a7586deaa4aef8cd5971e"}, + {file = "fonttools-4.53.0-cp311-cp311-win32.whl", hash = "sha256:bb7273789f69b565d88e97e9e1da602b4ee7ba733caf35a6c2affd4334d4f005"}, + {file = "fonttools-4.53.0-cp311-cp311-win_amd64.whl", hash = "sha256:9fe9096a60113e1d755e9e6bda15ef7e03391ee0554d22829aa506cdf946f796"}, + {file = 
"fonttools-4.53.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:d8f191a17369bd53a5557a5ee4bab91d5330ca3aefcdf17fab9a497b0e7cff7a"}, + {file = "fonttools-4.53.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:93156dd7f90ae0a1b0e8871032a07ef3178f553f0c70c386025a808f3a63b1f4"}, + {file = "fonttools-4.53.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bff98816cb144fb7b85e4b5ba3888a33b56ecef075b0e95b95bcd0a5fbf20f06"}, + {file = "fonttools-4.53.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:973d030180eca8255b1bce6ffc09ef38a05dcec0e8320cc9b7bcaa65346f341d"}, + {file = "fonttools-4.53.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:c4ee5a24e281fbd8261c6ab29faa7fd9a87a12e8c0eed485b705236c65999109"}, + {file = "fonttools-4.53.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:bd5bc124fae781a4422f61b98d1d7faa47985f663a64770b78f13d2c072410c2"}, + {file = "fonttools-4.53.0-cp312-cp312-win32.whl", hash = "sha256:a239afa1126b6a619130909c8404070e2b473dd2b7fc4aacacd2e763f8597fea"}, + {file = "fonttools-4.53.0-cp312-cp312-win_amd64.whl", hash = "sha256:45b4afb069039f0366a43a5d454bc54eea942bfb66b3fc3e9a2c07ef4d617380"}, + {file = "fonttools-4.53.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:93bc9e5aaa06ff928d751dc6be889ff3e7d2aa393ab873bc7f6396a99f6fbb12"}, + {file = "fonttools-4.53.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:2367d47816cc9783a28645bc1dac07f8ffc93e0f015e8c9fc674a5b76a6da6e4"}, + {file = "fonttools-4.53.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:907fa0b662dd8fc1d7c661b90782ce81afb510fc4b7aa6ae7304d6c094b27bce"}, + {file = "fonttools-4.53.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3e0ad3c6ea4bd6a289d958a1eb922767233f00982cf0fe42b177657c86c80a8f"}, + {file = "fonttools-4.53.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:73121a9b7ff93ada888aaee3985a88495489cc027894458cb1a736660bdfb206"}, + {file = "fonttools-4.53.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:ee595d7ba9bba130b2bec555a40aafa60c26ce68ed0cf509983e0f12d88674fd"}, + {file = "fonttools-4.53.0-cp38-cp38-win32.whl", hash = "sha256:fca66d9ff2ac89b03f5aa17e0b21a97c21f3491c46b583bb131eb32c7bab33af"}, + {file = "fonttools-4.53.0-cp38-cp38-win_amd64.whl", hash = "sha256:31f0e3147375002aae30696dd1dc596636abbd22fca09d2e730ecde0baad1d6b"}, + {file = "fonttools-4.53.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7d6166192dcd925c78a91d599b48960e0a46fe565391c79fe6de481ac44d20ac"}, + {file = "fonttools-4.53.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ef50ec31649fbc3acf6afd261ed89d09eb909b97cc289d80476166df8438524d"}, + {file = "fonttools-4.53.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7f193f060391a455920d61684a70017ef5284ccbe6023bb056e15e5ac3de11d1"}, + {file = "fonttools-4.53.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba9f09ff17f947392a855e3455a846f9855f6cf6bec33e9a427d3c1d254c712f"}, + {file = "fonttools-4.53.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:0c555e039d268445172b909b1b6bdcba42ada1cf4a60e367d68702e3f87e5f64"}, + {file = "fonttools-4.53.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:5a4788036201c908079e89ae3f5399b33bf45b9ea4514913f4dbbe4fac08efe0"}, + {file = "fonttools-4.53.0-cp39-cp39-win32.whl", hash = "sha256:d1a24f51a3305362b94681120c508758a88f207fa0a681c16b5a4172e9e6c7a9"}, + {file = 
"fonttools-4.53.0-cp39-cp39-win_amd64.whl", hash = "sha256:1e677bfb2b4bd0e5e99e0f7283e65e47a9814b0486cb64a41adf9ef110e078f2"}, + {file = "fonttools-4.53.0-py3-none-any.whl", hash = "sha256:6b4f04b1fbc01a3569d63359f2227c89ab294550de277fd09d8fca6185669fa4"}, + {file = "fonttools-4.53.0.tar.gz", hash = "sha256:c93ed66d32de1559b6fc348838c7572d5c0ac1e4a258e76763a5caddd8944002"}, +] + +[package.extras] +all = ["brotli (>=1.0.1)", "brotlicffi (>=0.8.0)", "fs (>=2.2.0,<3)", "lxml (>=4.0)", "lz4 (>=1.7.4.2)", "matplotlib", "munkres", "pycairo", "scipy", "skia-pathops (>=0.5.0)", "sympy", "uharfbuzz (>=0.23.0)", "unicodedata2 (>=15.1.0)", "xattr", "zopfli (>=0.1.4)"] +graphite = ["lz4 (>=1.7.4.2)"] +interpolatable = ["munkres", "pycairo", "scipy"] +lxml = ["lxml (>=4.0)"] +pathops = ["skia-pathops (>=0.5.0)"] +plot = ["matplotlib"] +repacker = ["uharfbuzz (>=0.23.0)"] +symfont = ["sympy"] +type1 = ["xattr"] +ufo = ["fs (>=2.2.0,<3)"] +unicode = ["unicodedata2 (>=15.1.0)"] +woff = ["brotli (>=1.0.1)", "brotlicffi (>=0.8.0)", "zopfli (>=0.1.4)"] + [[package]] name = "fqdn" version = "1.5.1" @@ -1347,13 +1451,13 @@ uritemplate = ">=3.0.1,<5" [[package]] name = "google-auth" -version = "2.30.0" +version = "2.31.0" description = "Google Authentication Library" optional = false python-versions = ">=3.7" files = [ - {file = "google-auth-2.30.0.tar.gz", hash = "sha256:ab630a1320f6720909ad76a7dbdb6841cdf5c66b328d690027e4867bdfb16688"}, - {file = "google_auth-2.30.0-py2.py3-none-any.whl", hash = "sha256:8df7da660f62757388b8a7f249df13549b3373f24388cb5d2f1dd91cc18180b5"}, + {file = "google-auth-2.31.0.tar.gz", hash = "sha256:87805c36970047247c8afe614d4e3af8eceafc1ebba0c679fe75ddd1d575e871"}, + {file = "google_auth-2.31.0-py2.py3-none-any.whl", hash = "sha256:042c4702efa9f7d3c48d3a69341c209381b125faa6dbf3ebe56bc7e40ae05c23"}, ] [package.dependencies] @@ -1754,13 +1858,13 @@ files = [ [[package]] name = "ipykernel" -version = "6.29.4" +version = "6.29.5" description = "IPython Kernel for Jupyter" optional = false python-versions = ">=3.8" files = [ - {file = "ipykernel-6.29.4-py3-none-any.whl", hash = "sha256:1181e653d95c6808039c509ef8e67c4126b3b3af7781496c7cbfb5ed938a27da"}, - {file = "ipykernel-6.29.4.tar.gz", hash = "sha256:3d44070060f9475ac2092b760123fadf105d2e2493c24848b6691a7c4f42af5c"}, + {file = "ipykernel-6.29.5-py3-none-any.whl", hash = "sha256:afdb66ba5aa354b09b91379bac28ae4afebbb30e8b39510c9690afb7a10421b5"}, + {file = "ipykernel-6.29.5.tar.gz", hash = "sha256:f093a22c4a40f8828f8e330a9c297cb93dcab13bd9678ded6de8e5cf81c56215"}, ] [package.dependencies] @@ -1787,13 +1891,13 @@ test = ["flaky", "ipyparallel", "pre-commit", "pytest (>=7.0)", "pytest-asyncio [[package]] name = "ipython" -version = "8.25.0" +version = "8.26.0" description = "IPython: Productive Interactive Computing" optional = false python-versions = ">=3.10" files = [ - {file = "ipython-8.25.0-py3-none-any.whl", hash = "sha256:53eee7ad44df903a06655871cbab66d156a051fd86f3ec6750470ac9604ac1ab"}, - {file = "ipython-8.25.0.tar.gz", hash = "sha256:c6ed726a140b6e725b911528f80439c534fac915246af3efc39440a6b0f9d716"}, + {file = "ipython-8.26.0-py3-none-any.whl", hash = "sha256:e6b347c27bdf9c32ee9d31ae85defc525755a1869f14057e900675b9e8d6e6ff"}, + {file = "ipython-8.26.0.tar.gz", hash = "sha256:1cec0fbba8404af13facebe83d04436a7434c7400e59f47acf467c64abd0956c"}, ] [package.dependencies] @@ -1819,7 +1923,7 @@ nbformat = ["nbformat"] notebook = ["ipywidgets", "notebook"] parallel = ["ipyparallel"] qtconsole = ["qtconsole"] -test = ["pickleshare", 
"pytest", "pytest-asyncio (<0.22)", "testpath"] +test = ["packaging", "pickleshare", "pytest", "pytest-asyncio (<0.22)", "testpath"] test-extra = ["curio", "ipython[test]", "matplotlib (!=3.2.0)", "nbformat", "numpy (>=1.23)", "pandas", "trio"] [[package]] @@ -2024,6 +2128,22 @@ files = [ [package.dependencies] jsonpointer = ">=1.9" +[[package]] +name = "jsonpickle" +version = "3.2.2" +description = "Python library for serializing arbitrary object graphs into JSON" +optional = false +python-versions = ">=3.7" +files = [ + {file = "jsonpickle-3.2.2-py3-none-any.whl", hash = "sha256:87cd82d237fd72c5a34970e7222dddc0accc13fddf49af84111887ed9a9445aa"}, + {file = "jsonpickle-3.2.2.tar.gz", hash = "sha256:d425fd2b8afe9f5d7d57205153403fbf897782204437882a477e8eed60930f8c"}, +] + +[package.extras] +docs = ["furo", "rst.linker (>=1.9)", "sphinx"] +packaging = ["build", "twine"] +testing = ["bson", "ecdsa", "feedparser", "gmpy2", "numpy", "pandas", "pymongo", "pytest (>=3.5,!=3.7.3)", "pytest-benchmark", "pytest-benchmark[histogram]", "pytest-checkdocs (>=1.2.3)", "pytest-cov", "pytest-enabler (>=1.0.1)", "pytest-ruff (>=0.2.1)", "scikit-learn", "scipy", "scipy (>=1.9.3)", "simplejson", "sqlalchemy", "ujson"] + [[package]] name = "jsonpointer" version = "3.0.0" @@ -2338,6 +2458,119 @@ files = [ {file = "jupyterlab_widgets-3.0.11.tar.gz", hash = "sha256:dd5ac679593c969af29c9bed054c24f26842baa51352114736756bc035deee27"}, ] +[[package]] +name = "kiwisolver" +version = "1.4.5" +description = "A fast implementation of the Cassowary constraint solver" +optional = false +python-versions = ">=3.7" +files = [ + {file = "kiwisolver-1.4.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:05703cf211d585109fcd72207a31bb170a0f22144d68298dc5e61b3c946518af"}, + {file = "kiwisolver-1.4.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:146d14bebb7f1dc4d5fbf74f8a6cb15ac42baadee8912eb84ac0b3b2a3dc6ac3"}, + {file = "kiwisolver-1.4.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6ef7afcd2d281494c0a9101d5c571970708ad911d028137cd558f02b851c08b4"}, + {file = "kiwisolver-1.4.5-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:9eaa8b117dc8337728e834b9c6e2611f10c79e38f65157c4c38e9400286f5cb1"}, + {file = "kiwisolver-1.4.5-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:ec20916e7b4cbfb1f12380e46486ec4bcbaa91a9c448b97023fde0d5bbf9e4ff"}, + {file = "kiwisolver-1.4.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:39b42c68602539407884cf70d6a480a469b93b81b7701378ba5e2328660c847a"}, + {file = "kiwisolver-1.4.5-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:aa12042de0171fad672b6c59df69106d20d5596e4f87b5e8f76df757a7c399aa"}, + {file = "kiwisolver-1.4.5-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2a40773c71d7ccdd3798f6489aaac9eee213d566850a9533f8d26332d626b82c"}, + {file = "kiwisolver-1.4.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:19df6e621f6d8b4b9c4d45f40a66839294ff2bb235e64d2178f7522d9170ac5b"}, + {file = "kiwisolver-1.4.5-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:83d78376d0d4fd884e2c114d0621624b73d2aba4e2788182d286309ebdeed770"}, + {file = "kiwisolver-1.4.5-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:e391b1f0a8a5a10ab3b9bb6afcfd74f2175f24f8975fb87ecae700d1503cdee0"}, + {file = "kiwisolver-1.4.5-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:852542f9481f4a62dbb5dd99e8ab7aedfeb8fb6342349a181d4036877410f525"}, + {file = 
"kiwisolver-1.4.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:59edc41b24031bc25108e210c0def6f6c2191210492a972d585a06ff246bb79b"}, + {file = "kiwisolver-1.4.5-cp310-cp310-win32.whl", hash = "sha256:a6aa6315319a052b4ee378aa171959c898a6183f15c1e541821c5c59beaa0238"}, + {file = "kiwisolver-1.4.5-cp310-cp310-win_amd64.whl", hash = "sha256:d0ef46024e6a3d79c01ff13801cb19d0cad7fd859b15037aec74315540acc276"}, + {file = "kiwisolver-1.4.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:11863aa14a51fd6ec28688d76f1735f8f69ab1fabf388851a595d0721af042f5"}, + {file = "kiwisolver-1.4.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:8ab3919a9997ab7ef2fbbed0cc99bb28d3c13e6d4b1ad36e97e482558a91be90"}, + {file = "kiwisolver-1.4.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:fcc700eadbbccbf6bc1bcb9dbe0786b4b1cb91ca0dcda336eef5c2beed37b797"}, + {file = "kiwisolver-1.4.5-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dfdd7c0b105af050eb3d64997809dc21da247cf44e63dc73ff0fd20b96be55a9"}, + {file = "kiwisolver-1.4.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:76c6a5964640638cdeaa0c359382e5703e9293030fe730018ca06bc2010c4437"}, + {file = "kiwisolver-1.4.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bbea0db94288e29afcc4c28afbf3a7ccaf2d7e027489c449cf7e8f83c6346eb9"}, + {file = "kiwisolver-1.4.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ceec1a6bc6cab1d6ff5d06592a91a692f90ec7505d6463a88a52cc0eb58545da"}, + {file = "kiwisolver-1.4.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:040c1aebeda72197ef477a906782b5ab0d387642e93bda547336b8957c61022e"}, + {file = "kiwisolver-1.4.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:f91de7223d4c7b793867797bacd1ee53bfe7359bd70d27b7b58a04efbb9436c8"}, + {file = "kiwisolver-1.4.5-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:faae4860798c31530dd184046a900e652c95513796ef51a12bc086710c2eec4d"}, + {file = "kiwisolver-1.4.5-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:b0157420efcb803e71d1b28e2c287518b8808b7cf1ab8af36718fd0a2c453eb0"}, + {file = "kiwisolver-1.4.5-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:06f54715b7737c2fecdbf140d1afb11a33d59508a47bf11bb38ecf21dc9ab79f"}, + {file = "kiwisolver-1.4.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:fdb7adb641a0d13bdcd4ef48e062363d8a9ad4a182ac7647ec88f695e719ae9f"}, + {file = "kiwisolver-1.4.5-cp311-cp311-win32.whl", hash = "sha256:bb86433b1cfe686da83ce32a9d3a8dd308e85c76b60896d58f082136f10bffac"}, + {file = "kiwisolver-1.4.5-cp311-cp311-win_amd64.whl", hash = "sha256:6c08e1312a9cf1074d17b17728d3dfce2a5125b2d791527f33ffbe805200a355"}, + {file = "kiwisolver-1.4.5-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:32d5cf40c4f7c7b3ca500f8985eb3fb3a7dfc023215e876f207956b5ea26632a"}, + {file = "kiwisolver-1.4.5-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:f846c260f483d1fd217fe5ed7c173fb109efa6b1fc8381c8b7552c5781756192"}, + {file = "kiwisolver-1.4.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5ff5cf3571589b6d13bfbfd6bcd7a3f659e42f96b5fd1c4830c4cf21d4f5ef45"}, + {file = "kiwisolver-1.4.5-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7269d9e5f1084a653d575c7ec012ff57f0c042258bf5db0954bf551c158466e7"}, + {file = "kiwisolver-1.4.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:da802a19d6e15dffe4b0c24b38b3af68e6c1a68e6e1d8f30148c83864f3881db"}, + {file = "kiwisolver-1.4.5-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3aba7311af82e335dd1e36ffff68aaca609ca6290c2cb6d821a39aa075d8e3ff"}, + {file = "kiwisolver-1.4.5-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:763773d53f07244148ccac5b084da5adb90bfaee39c197554f01b286cf869228"}, + {file = "kiwisolver-1.4.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2270953c0d8cdab5d422bee7d2007f043473f9d2999631c86a223c9db56cbd16"}, + {file = "kiwisolver-1.4.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:d099e745a512f7e3bbe7249ca835f4d357c586d78d79ae8f1dcd4d8adeb9bda9"}, + {file = "kiwisolver-1.4.5-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:74db36e14a7d1ce0986fa104f7d5637aea5c82ca6326ed0ec5694280942d1162"}, + {file = "kiwisolver-1.4.5-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:7e5bab140c309cb3a6ce373a9e71eb7e4873c70c2dda01df6820474f9889d6d4"}, + {file = "kiwisolver-1.4.5-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:0f114aa76dc1b8f636d077979c0ac22e7cd8f3493abbab152f20eb8d3cda71f3"}, + {file = "kiwisolver-1.4.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:88a2df29d4724b9237fc0c6eaf2a1adae0cdc0b3e9f4d8e7dc54b16812d2d81a"}, + {file = "kiwisolver-1.4.5-cp312-cp312-win32.whl", hash = "sha256:72d40b33e834371fd330fb1472ca19d9b8327acb79a5821d4008391db8e29f20"}, + {file = "kiwisolver-1.4.5-cp312-cp312-win_amd64.whl", hash = "sha256:2c5674c4e74d939b9d91dda0fae10597ac7521768fec9e399c70a1f27e2ea2d9"}, + {file = "kiwisolver-1.4.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:3a2b053a0ab7a3960c98725cfb0bf5b48ba82f64ec95fe06f1d06c99b552e130"}, + {file = "kiwisolver-1.4.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3cd32d6c13807e5c66a7cbb79f90b553642f296ae4518a60d8d76243b0ad2898"}, + {file = "kiwisolver-1.4.5-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:59ec7b7c7e1a61061850d53aaf8e93db63dce0c936db1fda2658b70e4a1be709"}, + {file = "kiwisolver-1.4.5-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:da4cfb373035def307905d05041c1d06d8936452fe89d464743ae7fb8371078b"}, + {file = "kiwisolver-1.4.5-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2400873bccc260b6ae184b2b8a4fec0e4082d30648eadb7c3d9a13405d861e89"}, + {file = "kiwisolver-1.4.5-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:1b04139c4236a0f3aff534479b58f6f849a8b351e1314826c2d230849ed48985"}, + {file = "kiwisolver-1.4.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:4e66e81a5779b65ac21764c295087de82235597a2293d18d943f8e9e32746265"}, + {file = "kiwisolver-1.4.5-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:7931d8f1f67c4be9ba1dd9c451fb0eeca1a25b89e4d3f89e828fe12a519b782a"}, + {file = "kiwisolver-1.4.5-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:b3f7e75f3015df442238cca659f8baa5f42ce2a8582727981cbfa15fee0ee205"}, + {file = "kiwisolver-1.4.5-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:bbf1d63eef84b2e8c89011b7f2235b1e0bf7dacc11cac9431fc6468e99ac77fb"}, + {file = "kiwisolver-1.4.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:4c380469bd3f970ef677bf2bcba2b6b0b4d5c75e7a020fb863ef75084efad66f"}, + {file = "kiwisolver-1.4.5-cp37-cp37m-win32.whl", hash = "sha256:9408acf3270c4b6baad483865191e3e582b638b1654a007c62e3efe96f09a9a3"}, + {file = "kiwisolver-1.4.5-cp37-cp37m-win_amd64.whl", hash = 
"sha256:5b94529f9b2591b7af5f3e0e730a4e0a41ea174af35a4fd067775f9bdfeee01a"}, + {file = "kiwisolver-1.4.5-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:11c7de8f692fc99816e8ac50d1d1aef4f75126eefc33ac79aac02c099fd3db71"}, + {file = "kiwisolver-1.4.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:53abb58632235cd154176ced1ae8f0d29a6657aa1aa9decf50b899b755bc2b93"}, + {file = "kiwisolver-1.4.5-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:88b9f257ca61b838b6f8094a62418421f87ac2a1069f7e896c36a7d86b5d4c29"}, + {file = "kiwisolver-1.4.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3195782b26fc03aa9c6913d5bad5aeb864bdc372924c093b0f1cebad603dd712"}, + {file = "kiwisolver-1.4.5-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fc579bf0f502e54926519451b920e875f433aceb4624a3646b3252b5caa9e0b6"}, + {file = "kiwisolver-1.4.5-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5a580c91d686376f0f7c295357595c5a026e6cbc3d77b7c36e290201e7c11ecb"}, + {file = "kiwisolver-1.4.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:cfe6ab8da05c01ba6fbea630377b5da2cd9bcbc6338510116b01c1bc939a2c18"}, + {file = "kiwisolver-1.4.5-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:d2e5a98f0ec99beb3c10e13b387f8db39106d53993f498b295f0c914328b1333"}, + {file = "kiwisolver-1.4.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:a51a263952b1429e429ff236d2f5a21c5125437861baeed77f5e1cc2d2c7c6da"}, + {file = "kiwisolver-1.4.5-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:3edd2fa14e68c9be82c5b16689e8d63d89fe927e56debd6e1dbce7a26a17f81b"}, + {file = "kiwisolver-1.4.5-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:74d1b44c6cfc897df648cc9fdaa09bc3e7679926e6f96df05775d4fb3946571c"}, + {file = "kiwisolver-1.4.5-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:76d9289ed3f7501012e05abb8358bbb129149dbd173f1f57a1bf1c22d19ab7cc"}, + {file = "kiwisolver-1.4.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:92dea1ffe3714fa8eb6a314d2b3c773208d865a0e0d35e713ec54eea08a66250"}, + {file = "kiwisolver-1.4.5-cp38-cp38-win32.whl", hash = "sha256:5c90ae8c8d32e472be041e76f9d2f2dbff4d0b0be8bd4041770eddb18cf49a4e"}, + {file = "kiwisolver-1.4.5-cp38-cp38-win_amd64.whl", hash = "sha256:c7940c1dc63eb37a67721b10d703247552416f719c4188c54e04334321351ced"}, + {file = "kiwisolver-1.4.5-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:9407b6a5f0d675e8a827ad8742e1d6b49d9c1a1da5d952a67d50ef5f4170b18d"}, + {file = "kiwisolver-1.4.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:15568384086b6df3c65353820a4473575dbad192e35010f622c6ce3eebd57af9"}, + {file = "kiwisolver-1.4.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:0dc9db8e79f0036e8173c466d21ef18e1befc02de8bf8aa8dc0813a6dc8a7046"}, + {file = "kiwisolver-1.4.5-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:cdc8a402aaee9a798b50d8b827d7ecf75edc5fb35ea0f91f213ff927c15f4ff0"}, + {file = "kiwisolver-1.4.5-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:6c3bd3cde54cafb87d74d8db50b909705c62b17c2099b8f2e25b461882e544ff"}, + {file = "kiwisolver-1.4.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:955e8513d07a283056b1396e9a57ceddbd272d9252c14f154d450d227606eb54"}, + {file = "kiwisolver-1.4.5-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:346f5343b9e3f00b8db8ba359350eb124b98c99efd0b408728ac6ebf38173958"}, + {file = 
"kiwisolver-1.4.5-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b9098e0049e88c6a24ff64545cdfc50807818ba6c1b739cae221bbbcbc58aad3"}, + {file = "kiwisolver-1.4.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:00bd361b903dc4bbf4eb165f24d1acbee754fce22ded24c3d56eec268658a5cf"}, + {file = "kiwisolver-1.4.5-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:7b8b454bac16428b22560d0a1cf0a09875339cab69df61d7805bf48919415901"}, + {file = "kiwisolver-1.4.5-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:f1d072c2eb0ad60d4c183f3fb44ac6f73fb7a8f16a2694a91f988275cbf352f9"}, + {file = "kiwisolver-1.4.5-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:31a82d498054cac9f6d0b53d02bb85811185bcb477d4b60144f915f3b3126342"}, + {file = "kiwisolver-1.4.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:6512cb89e334e4700febbffaaa52761b65b4f5a3cf33f960213d5656cea36a77"}, + {file = "kiwisolver-1.4.5-cp39-cp39-win32.whl", hash = "sha256:9db8ea4c388fdb0f780fe91346fd438657ea602d58348753d9fb265ce1bca67f"}, + {file = "kiwisolver-1.4.5-cp39-cp39-win_amd64.whl", hash = "sha256:59415f46a37f7f2efeec758353dd2eae1b07640d8ca0f0c42548ec4125492635"}, + {file = "kiwisolver-1.4.5-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:5c7b3b3a728dc6faf3fc372ef24f21d1e3cee2ac3e9596691d746e5a536de920"}, + {file = "kiwisolver-1.4.5-pp37-pypy37_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:620ced262a86244e2be10a676b646f29c34537d0d9cc8eb26c08f53d98013390"}, + {file = "kiwisolver-1.4.5-pp37-pypy37_pp73-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:378a214a1e3bbf5ac4a8708304318b4f890da88c9e6a07699c4ae7174c09a68d"}, + {file = "kiwisolver-1.4.5-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aaf7be1207676ac608a50cd08f102f6742dbfc70e8d60c4db1c6897f62f71523"}, + {file = "kiwisolver-1.4.5-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:ba55dce0a9b8ff59495ddd050a0225d58bd0983d09f87cfe2b6aec4f2c1234e4"}, + {file = "kiwisolver-1.4.5-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:fd32ea360bcbb92d28933fc05ed09bffcb1704ba3fc7942e81db0fd4f81a7892"}, + {file = "kiwisolver-1.4.5-pp38-pypy38_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:5e7139af55d1688f8b960ee9ad5adafc4ac17c1c473fe07133ac092310d76544"}, + {file = "kiwisolver-1.4.5-pp38-pypy38_pp73-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:dced8146011d2bc2e883f9bd68618b8247387f4bbec46d7392b3c3b032640126"}, + {file = "kiwisolver-1.4.5-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c9bf3325c47b11b2e51bca0824ea217c7cd84491d8ac4eefd1e409705ef092bd"}, + {file = "kiwisolver-1.4.5-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:5794cf59533bc3f1b1c821f7206a3617999db9fbefc345360aafe2e067514929"}, + {file = "kiwisolver-1.4.5-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:e368f200bbc2e4f905b8e71eb38b3c04333bddaa6a2464a6355487b02bb7fb09"}, + {file = "kiwisolver-1.4.5-pp39-pypy39_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e5d706eba36b4c4d5bc6c6377bb6568098765e990cfc21ee16d13963fab7b3e7"}, + {file = "kiwisolver-1.4.5-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:85267bd1aa8880a9c88a8cb71e18d3d64d2751a790e6ca6c27b8ccc724bcd5ad"}, + {file = "kiwisolver-1.4.5-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:210ef2c3a1f03272649aff1ef992df2e724748918c4bc2d5a90352849eb40bea"}, + {file = 
"kiwisolver-1.4.5-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:11d011a7574eb3b82bcc9c1a1d35c1d7075677fdd15de527d91b46bd35e935ee"}, + {file = "kiwisolver-1.4.5.tar.gz", hash = "sha256:e57e563a57fb22a142da34f38acc2fc1a5c864bc29ca1517a88abc963e60d6ec"}, +] + [[package]] name = "langchain" version = "0.2.6" @@ -2414,13 +2647,13 @@ tenacity = ">=8.1.0,<8.4.0 || >8.4.0,<9.0.0" [[package]] name = "langchain-openai" -version = "0.1.11" +version = "0.1.13" description = "An integration package connecting OpenAI and LangChain" optional = false python-versions = "<4.0,>=3.8.1" files = [ - {file = "langchain_openai-0.1.11-py3-none-any.whl", hash = "sha256:12676e02846db63938ed40177d3271c8419a0659023afaee6da42518a0a28630"}, - {file = "langchain_openai-0.1.11.tar.gz", hash = "sha256:a5901bfee091060b7915fd6de6fd98aa429544de8cab51ac9b13ba484d9caffb"}, + {file = "langchain_openai-0.1.13-py3-none-any.whl", hash = "sha256:4344b6c5c67088a28eed80ba763157fdd1d690cee679966a021b42f305dbf7b5"}, + {file = "langchain_openai-0.1.13.tar.gz", hash = "sha256:03318669bcb3238f7d1bb043329f91d150ca09246f1faf569ef299f535405c71"}, ] [package.dependencies] @@ -2484,7 +2717,7 @@ typing = ["mypy (>=1.0.0)", "types-setuptools"] [[package]] name = "lightrag" -version = "0.1.0" +version = "0.0.0-alpha.6" description = "The 'PyTorch' library for LLM applications. RAG=Retriever-Agent-Generator." optional = false python-versions = ">=3.10, <4.0" @@ -2493,13 +2726,9 @@ develop = true [package.dependencies] backoff = "^2.2.1" -faiss-cpu = "^1.8.0" -groq = "^0.5.0" jinja2 = "^3.1.3" jsonlines = "^4.0.0" -more-itertools = "^10.3.0" numpy = "^1.26.4" -openai = "^1.12.0" python-dotenv = "^1.0.1" tiktoken = "^0.7.0" @@ -2524,19 +2753,19 @@ pydantic = ">=1.10" [[package]] name = "llama-index" -version = "0.10.50" +version = "0.10.51" description = "Interface between LLMs and your data" optional = false python-versions = "<4.0,>=3.8.1" files = [ - {file = "llama_index-0.10.50-py3-none-any.whl", hash = "sha256:18426f1f23378bde506646a6e016c11b7e99f947bcc9c768384ae776da2aeab7"}, - {file = "llama_index-0.10.50.tar.gz", hash = "sha256:57f8cbd3d981f68ebafff97268f4afb53a1f9f4af0bb748858b14310336b2a4d"}, + {file = "llama_index-0.10.51-py3-none-any.whl", hash = "sha256:dd7c01fdeb103ee177681bb240c9a0c1d3f9ce34cb797f4f6cb991b2d1bd4f4e"}, + {file = "llama_index-0.10.51.tar.gz", hash = "sha256:4b23e946e7fd80c7e6f8c6323c44b2e413301a485ca1294caf50145c2629c05d"}, ] [package.dependencies] llama-index-agent-openai = ">=0.1.4,<0.3.0" llama-index-cli = ">=0.1.2,<0.2.0" -llama-index-core = "0.10.50" +llama-index-core = "0.10.51" llama-index-embeddings-openai = ">=0.1.5,<0.2.0" llama-index-indices-managed-llama-cloud = ">=0.2.0" llama-index-legacy = ">=0.9.48,<0.10.0" @@ -2581,13 +2810,13 @@ llama-index-llms-openai = ">=0.1.1,<0.2.0" [[package]] name = "llama-index-core" -version = "0.10.50" +version = "0.10.51" description = "Interface between LLMs and your data" optional = false python-versions = "<4.0,>=3.8.1" files = [ - {file = "llama_index_core-0.10.50-py3-none-any.whl", hash = "sha256:8a1fc49d65156002bb7f4c82458385ee440bd11f81d1bc39ed03ce94f53c9c74"}, - {file = "llama_index_core-0.10.50.tar.gz", hash = "sha256:9e64dd54b19aca235b360b2f81d6f93014f1439304cc9369a0e53e4cba9e33aa"}, + {file = "llama_index_core-0.10.51-py3-none-any.whl", hash = "sha256:34051f188258bfb71335e018dcdf54291ef4feda15cadebb9be0402e98bd1d27"}, + {file = "llama_index_core-0.10.51.tar.gz", hash = "sha256:40b7052a6127810032b2d0b6abd72dd1767d8dd589c0f13747c4307942e81dd5"}, ] 
[package.dependencies] @@ -2631,13 +2860,13 @@ llama-index-core = ">=0.10.1,<0.11.0" [[package]] name = "llama-index-indices-managed-llama-cloud" -version = "0.2.1" +version = "0.2.2" description = "llama-index indices llama-cloud integration" optional = false python-versions = "<4.0,>=3.8.1" files = [ - {file = "llama_index_indices_managed_llama_cloud-0.2.1-py3-none-any.whl", hash = "sha256:69abd37bc7b57abcea841eea2a89cb0adee29bce3fd05c61e3082ae50f047b87"}, - {file = "llama_index_indices_managed_llama_cloud-0.2.1.tar.gz", hash = "sha256:b07fa606f1085e22918d2d45e00ab86f3430f36057e115322bd360b695eef565"}, + {file = "llama_index_indices_managed_llama_cloud-0.2.2-py3-none-any.whl", hash = "sha256:30c73a77fc54fa83c4a183fcdc3b5138a6b709a6fefc9539d0cb0c6315b0f2fc"}, + {file = "llama_index_indices_managed_llama_cloud-0.2.2.tar.gz", hash = "sha256:9a3db075878bc7adf798a74ec4d6220dec5421f46c0675702a94894934d17a7a"}, ] [package.dependencies] @@ -2685,13 +2914,13 @@ query-tools = ["guidance (>=0.0.64,<0.0.65)", "jsonpath-ng (>=1.6.0,<2.0.0)", "l [[package]] name = "llama-index-llms-openai" -version = "0.1.23" +version = "0.1.24" description = "llama-index llms openai integration" optional = false python-versions = "<4.0,>=3.8.1" files = [ - {file = "llama_index_llms_openai-0.1.23-py3-none-any.whl", hash = "sha256:38753baac823a0459b8f6511258d84020219cb6b223a9866ec526e83ddbc94e1"}, - {file = "llama_index_llms_openai-0.1.23.tar.gz", hash = "sha256:b40289c47fda9df86c8177999d6af0a47fce14fe4324572ea2fe25bbdbd05021"}, + {file = "llama_index_llms_openai-0.1.24-py3-none-any.whl", hash = "sha256:c7b71cd34765e2d080d5eaf23c602877cc74fea162b59d53965273b2d4c4a56a"}, + {file = "llama_index_llms_openai-0.1.24.tar.gz", hash = "sha256:9031bd155c303f89cc51cfcc75d7d6f12fffa4274f2f9c7f67d4140350d13d56"}, ] [package.dependencies] @@ -2799,13 +3028,13 @@ sqlalchemy = {version = ">=1.4.49,<2.1", extras = ["asyncio"]} [[package]] name = "llama-parse" -version = "0.4.4" +version = "0.4.5" description = "Parse files into RAG-Optimized formats." 
optional = false python-versions = "<4.0,>=3.8.1" files = [ - {file = "llama_parse-0.4.4-py3-none-any.whl", hash = "sha256:bb9724d04fd31ed037000896c7cef7fcb9051325497db4592a15f8144754cd00"}, - {file = "llama_parse-0.4.4.tar.gz", hash = "sha256:b45c2db33a0d6b7a2d5f59e3d0ec7ee7f8227a852eaa56b04aa12b12f2c0d521"}, + {file = "llama_parse-0.4.5-py3-none-any.whl", hash = "sha256:a68fc91a2b0bce98a4960b8f709ca3c2f90b421da66e0d8522f0ea45b78846b9"}, + {file = "llama_parse-0.4.5.tar.gz", hash = "sha256:08a48bcf4af5b623bf26fa6266038572b95409f7be64746067db8d38f6927fe5"}, ] [package.dependencies] @@ -2899,6 +3128,58 @@ dev = ["marshmallow[tests]", "pre-commit (>=3.5,<4.0)", "tox"] docs = ["alabaster (==0.7.16)", "autodocsumm (==0.2.12)", "sphinx (==7.3.7)", "sphinx-issues (==4.1.0)", "sphinx-version-warning (==1.1.2)"] tests = ["pytest", "pytz", "simplejson"] +[[package]] +name = "matplotlib" +version = "3.9.0" +description = "Python plotting package" +optional = false +python-versions = ">=3.9" +files = [ + {file = "matplotlib-3.9.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:2bcee1dffaf60fe7656183ac2190bd630842ff87b3153afb3e384d966b57fe56"}, + {file = "matplotlib-3.9.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3f988bafb0fa39d1074ddd5bacd958c853e11def40800c5824556eb630f94d3b"}, + {file = "matplotlib-3.9.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fe428e191ea016bb278758c8ee82a8129c51d81d8c4bc0846c09e7e8e9057241"}, + {file = "matplotlib-3.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eaf3978060a106fab40c328778b148f590e27f6fa3cd15a19d6892575bce387d"}, + {file = "matplotlib-3.9.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:2e7f03e5cbbfacdd48c8ea394d365d91ee8f3cae7e6ec611409927b5ed997ee4"}, + {file = "matplotlib-3.9.0-cp310-cp310-win_amd64.whl", hash = "sha256:13beb4840317d45ffd4183a778685e215939be7b08616f431c7795276e067463"}, + {file = "matplotlib-3.9.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:063af8587fceeac13b0936c42a2b6c732c2ab1c98d38abc3337e430e1ff75e38"}, + {file = "matplotlib-3.9.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9a2fa6d899e17ddca6d6526cf6e7ba677738bf2a6a9590d702c277204a7c6152"}, + {file = "matplotlib-3.9.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:550cdda3adbd596078cca7d13ed50b77879104e2e46392dcd7c75259d8f00e85"}, + {file = "matplotlib-3.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:76cce0f31b351e3551d1f3779420cf8f6ec0d4a8cf9c0237a3b549fd28eb4abb"}, + {file = "matplotlib-3.9.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c53aeb514ccbbcbab55a27f912d79ea30ab21ee0531ee2c09f13800efb272674"}, + {file = "matplotlib-3.9.0-cp311-cp311-win_amd64.whl", hash = "sha256:a5be985db2596d761cdf0c2eaf52396f26e6a64ab46bd8cd810c48972349d1be"}, + {file = "matplotlib-3.9.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:c79f3a585f1368da6049318bdf1f85568d8d04b2e89fc24b7e02cc9b62017382"}, + {file = "matplotlib-3.9.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:bdd1ecbe268eb3e7653e04f451635f0fb0f77f07fd070242b44c076c9106da84"}, + {file = "matplotlib-3.9.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d38e85a1a6d732f645f1403ce5e6727fd9418cd4574521d5803d3d94911038e5"}, + {file = "matplotlib-3.9.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0a490715b3b9984fa609116481b22178348c1a220a4499cda79132000a79b4db"}, + {file = 
"matplotlib-3.9.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8146ce83cbc5dc71c223a74a1996d446cd35cfb6a04b683e1446b7e6c73603b7"}, + {file = "matplotlib-3.9.0-cp312-cp312-win_amd64.whl", hash = "sha256:d91a4ffc587bacf5c4ce4ecfe4bcd23a4b675e76315f2866e588686cc97fccdf"}, + {file = "matplotlib-3.9.0-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:616fabf4981a3b3c5a15cd95eba359c8489c4e20e03717aea42866d8d0465956"}, + {file = "matplotlib-3.9.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:cd53c79fd02f1c1808d2cfc87dd3cf4dbc63c5244a58ee7944497107469c8d8a"}, + {file = "matplotlib-3.9.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:06a478f0d67636554fa78558cfbcd7b9dba85b51f5c3b5a0c9be49010cf5f321"}, + {file = "matplotlib-3.9.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:81c40af649d19c85f8073e25e5806926986806fa6d54be506fbf02aef47d5a89"}, + {file = "matplotlib-3.9.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:52146fc3bd7813cc784562cb93a15788be0b2875c4655e2cc6ea646bfa30344b"}, + {file = "matplotlib-3.9.0-cp39-cp39-win_amd64.whl", hash = "sha256:0fc51eaa5262553868461c083d9adadb11a6017315f3a757fc45ec6ec5f02888"}, + {file = "matplotlib-3.9.0-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:bd4f2831168afac55b881db82a7730992aa41c4f007f1913465fb182d6fb20c0"}, + {file = "matplotlib-3.9.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:290d304e59be2b33ef5c2d768d0237f5bd132986bdcc66f80bc9bcc300066a03"}, + {file = "matplotlib-3.9.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ff2e239c26be4f24bfa45860c20ffccd118d270c5b5d081fa4ea409b5469fcd"}, + {file = "matplotlib-3.9.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:af4001b7cae70f7eaacfb063db605280058246de590fa7874f00f62259f2df7e"}, + {file = "matplotlib-3.9.0.tar.gz", hash = "sha256:e6d29ea6c19e34b30fb7d88b7081f869a03014f66fe06d62cc77d5a6ea88ed7a"}, +] + +[package.dependencies] +contourpy = ">=1.0.1" +cycler = ">=0.10" +fonttools = ">=4.22.0" +kiwisolver = ">=1.3.1" +numpy = ">=1.23" +packaging = ">=20.0" +pillow = ">=8" +pyparsing = ">=2.3.1" +python-dateutil = ">=2.7" + +[package.extras] +dev = ["meson-python (>=0.13.1)", "numpy (>=1.25)", "pybind11 (>=2.6)", "setuptools (>=64)", "setuptools_scm (>=7)"] + [[package]] name = "matplotlib-inline" version = "0.1.7" @@ -2942,17 +3223,6 @@ files = [ intel-openmp = "==2021.*" tbb = "==2021.*" -[[package]] -name = "more-itertools" -version = "10.3.0" -description = "More routines for operating on iterables, beyond itertools" -optional = false -python-versions = ">=3.8" -files = [ - {file = "more-itertools-10.3.0.tar.gz", hash = "sha256:e5d93ef411224fbcef366a6e8ddc4c5781bc6359d43412a65dd5964e46111463"}, - {file = "more_itertools-10.3.0-py3-none-any.whl", hash = "sha256:ea6a02e24a9161e51faad17a8782b92a0df82c12c1c8886fec7f0c3fa1a1b320"}, -] - [[package]] name = "mpmath" version = "1.3.0" @@ -3475,13 +3745,13 @@ files = [ [[package]] name = "nvidia-nvjitlink-cu12" -version = "12.5.40" +version = "12.5.82" description = "Nvidia JIT LTO Library" optional = false python-versions = ">=3" files = [ - {file = "nvidia_nvjitlink_cu12-12.5.40-py3-none-manylinux2014_x86_64.whl", hash = "sha256:d9714f27c1d0f0895cd8915c07a87a1d0029a0aa36acaf9156952ec2a8a12189"}, - {file = "nvidia_nvjitlink_cu12-12.5.40-py3-none-win_amd64.whl", hash = "sha256:c3401dc8543b52d3a8158007a0c1ab4e9c768fcbd24153a48c86972102197ddd"}, + {file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-manylinux2014_x86_64.whl", hash = 
"sha256:f9b37bc5c8cf7509665cb6ada5aaa0ce65618f2332b7d3e78e9790511f111212"}, + {file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-win_amd64.whl", hash = "sha256:e782564d705ff0bf61ac3e1bf730166da66dd2fe9012f111ede5fc49b64ae697"}, ] [[package]] @@ -3750,84 +4020,95 @@ numpy = "*" [[package]] name = "pillow" -version = "10.3.0" +version = "10.4.0" description = "Python Imaging Library (Fork)" optional = false python-versions = ">=3.8" files = [ - {file = "pillow-10.3.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:90b9e29824800e90c84e4022dd5cc16eb2d9605ee13f05d47641eb183cd73d45"}, - {file = "pillow-10.3.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a2c405445c79c3f5a124573a051062300936b0281fee57637e706453e452746c"}, - {file = "pillow-10.3.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:78618cdbccaa74d3f88d0ad6cb8ac3007f1a6fa5c6f19af64b55ca170bfa1edf"}, - {file = "pillow-10.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:261ddb7ca91fcf71757979534fb4c128448b5b4c55cb6152d280312062f69599"}, - {file = "pillow-10.3.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:ce49c67f4ea0609933d01c0731b34b8695a7a748d6c8d186f95e7d085d2fe475"}, - {file = "pillow-10.3.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:b14f16f94cbc61215115b9b1236f9c18403c15dd3c52cf629072afa9d54c1cbf"}, - {file = "pillow-10.3.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:d33891be6df59d93df4d846640f0e46f1a807339f09e79a8040bc887bdcd7ed3"}, - {file = "pillow-10.3.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:b50811d664d392f02f7761621303eba9d1b056fb1868c8cdf4231279645c25f5"}, - {file = "pillow-10.3.0-cp310-cp310-win32.whl", hash = "sha256:ca2870d5d10d8726a27396d3ca4cf7976cec0f3cb706debe88e3a5bd4610f7d2"}, - {file = "pillow-10.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:f0d0591a0aeaefdaf9a5e545e7485f89910c977087e7de2b6c388aec32011e9f"}, - {file = "pillow-10.3.0-cp310-cp310-win_arm64.whl", hash = "sha256:ccce24b7ad89adb5a1e34a6ba96ac2530046763912806ad4c247356a8f33a67b"}, - {file = "pillow-10.3.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:5f77cf66e96ae734717d341c145c5949c63180842a545c47a0ce7ae52ca83795"}, - {file = "pillow-10.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e4b878386c4bf293578b48fc570b84ecfe477d3b77ba39a6e87150af77f40c57"}, - {file = "pillow-10.3.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fdcbb4068117dfd9ce0138d068ac512843c52295ed996ae6dd1faf537b6dbc27"}, - {file = "pillow-10.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9797a6c8fe16f25749b371c02e2ade0efb51155e767a971c61734b1bf6293994"}, - {file = "pillow-10.3.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:9e91179a242bbc99be65e139e30690e081fe6cb91a8e77faf4c409653de39451"}, - {file = "pillow-10.3.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:1b87bd9d81d179bd8ab871603bd80d8645729939f90b71e62914e816a76fc6bd"}, - {file = "pillow-10.3.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:81d09caa7b27ef4e61cb7d8fbf1714f5aec1c6b6c5270ee53504981e6e9121ad"}, - {file = "pillow-10.3.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:048ad577748b9fa4a99a0548c64f2cb8d672d5bf2e643a739ac8faff1164238c"}, - {file = "pillow-10.3.0-cp311-cp311-win32.whl", hash = "sha256:7161ec49ef0800947dc5570f86568a7bb36fa97dd09e9827dc02b718c5643f09"}, - {file = "pillow-10.3.0-cp311-cp311-win_amd64.whl", hash = 
"sha256:8eb0908e954d093b02a543dc963984d6e99ad2b5e36503d8a0aaf040505f747d"}, - {file = "pillow-10.3.0-cp311-cp311-win_arm64.whl", hash = "sha256:4e6f7d1c414191c1199f8996d3f2282b9ebea0945693fb67392c75a3a320941f"}, - {file = "pillow-10.3.0-cp312-cp312-macosx_10_10_x86_64.whl", hash = "sha256:e46f38133e5a060d46bd630faa4d9fa0202377495df1f068a8299fd78c84de84"}, - {file = "pillow-10.3.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:50b8eae8f7334ec826d6eeffaeeb00e36b5e24aa0b9df322c247539714c6df19"}, - {file = "pillow-10.3.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9d3bea1c75f8c53ee4d505c3e67d8c158ad4df0d83170605b50b64025917f338"}, - {file = "pillow-10.3.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:19aeb96d43902f0a783946a0a87dbdad5c84c936025b8419da0a0cd7724356b1"}, - {file = "pillow-10.3.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:74d28c17412d9caa1066f7a31df8403ec23d5268ba46cd0ad2c50fb82ae40462"}, - {file = "pillow-10.3.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:ff61bfd9253c3915e6d41c651d5f962da23eda633cf02262990094a18a55371a"}, - {file = "pillow-10.3.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:d886f5d353333b4771d21267c7ecc75b710f1a73d72d03ca06df49b09015a9ef"}, - {file = "pillow-10.3.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4b5ec25d8b17217d635f8935dbc1b9aa5907962fae29dff220f2659487891cd3"}, - {file = "pillow-10.3.0-cp312-cp312-win32.whl", hash = "sha256:51243f1ed5161b9945011a7360e997729776f6e5d7005ba0c6879267d4c5139d"}, - {file = "pillow-10.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:412444afb8c4c7a6cc11a47dade32982439925537e483be7c0ae0cf96c4f6a0b"}, - {file = "pillow-10.3.0-cp312-cp312-win_arm64.whl", hash = "sha256:798232c92e7665fe82ac085f9d8e8ca98826f8e27859d9a96b41d519ecd2e49a"}, - {file = "pillow-10.3.0-cp38-cp38-macosx_10_10_x86_64.whl", hash = "sha256:4eaa22f0d22b1a7e93ff0a596d57fdede2e550aecffb5a1ef1106aaece48e96b"}, - {file = "pillow-10.3.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:cd5e14fbf22a87321b24c88669aad3a51ec052eb145315b3da3b7e3cc105b9a2"}, - {file = "pillow-10.3.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1530e8f3a4b965eb6a7785cf17a426c779333eb62c9a7d1bbcf3ffd5bf77a4aa"}, - {file = "pillow-10.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d512aafa1d32efa014fa041d38868fda85028e3f930a96f85d49c7d8ddc0383"}, - {file = "pillow-10.3.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:339894035d0ede518b16073bdc2feef4c991ee991a29774b33e515f1d308e08d"}, - {file = "pillow-10.3.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:aa7e402ce11f0885305bfb6afb3434b3cd8f53b563ac065452d9d5654c7b86fd"}, - {file = "pillow-10.3.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:0ea2a783a2bdf2a561808fe4a7a12e9aa3799b701ba305de596bc48b8bdfce9d"}, - {file = "pillow-10.3.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:c78e1b00a87ce43bb37642c0812315b411e856a905d58d597750eb79802aaaa3"}, - {file = "pillow-10.3.0-cp38-cp38-win32.whl", hash = "sha256:72d622d262e463dfb7595202d229f5f3ab4b852289a1cd09650362db23b9eb0b"}, - {file = "pillow-10.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:2034f6759a722da3a3dbd91a81148cf884e91d1b747992ca288ab88c1de15999"}, - {file = "pillow-10.3.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:2ed854e716a89b1afcedea551cd85f2eb2a807613752ab997b9974aaa0d56936"}, - {file = "pillow-10.3.0-cp39-cp39-macosx_11_0_arm64.whl", hash = 
"sha256:dc1a390a82755a8c26c9964d457d4c9cbec5405896cba94cf51f36ea0d855002"}, - {file = "pillow-10.3.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4203efca580f0dd6f882ca211f923168548f7ba334c189e9eab1178ab840bf60"}, - {file = "pillow-10.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3102045a10945173d38336f6e71a8dc71bcaeed55c3123ad4af82c52807b9375"}, - {file = "pillow-10.3.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:6fb1b30043271ec92dc65f6d9f0b7a830c210b8a96423074b15c7bc999975f57"}, - {file = "pillow-10.3.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:1dfc94946bc60ea375cc39cff0b8da6c7e5f8fcdc1d946beb8da5c216156ddd8"}, - {file = "pillow-10.3.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b09b86b27a064c9624d0a6c54da01c1beaf5b6cadfa609cf63789b1d08a797b9"}, - {file = "pillow-10.3.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:d3b2348a78bc939b4fed6552abfd2e7988e0f81443ef3911a4b8498ca084f6eb"}, - {file = "pillow-10.3.0-cp39-cp39-win32.whl", hash = "sha256:45ebc7b45406febf07fef35d856f0293a92e7417ae7933207e90bf9090b70572"}, - {file = "pillow-10.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:0ba26351b137ca4e0db0342d5d00d2e355eb29372c05afd544ebf47c0956ffeb"}, - {file = "pillow-10.3.0-cp39-cp39-win_arm64.whl", hash = "sha256:50fd3f6b26e3441ae07b7c979309638b72abc1a25da31a81a7fbd9495713ef4f"}, - {file = "pillow-10.3.0-pp310-pypy310_pp73-macosx_10_10_x86_64.whl", hash = "sha256:6b02471b72526ab8a18c39cb7967b72d194ec53c1fd0a70b050565a0f366d355"}, - {file = "pillow-10.3.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:8ab74c06ffdab957d7670c2a5a6e1a70181cd10b727cd788c4dd9005b6a8acd9"}, - {file = "pillow-10.3.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:048eeade4c33fdf7e08da40ef402e748df113fd0b4584e32c4af74fe78baaeb2"}, - {file = "pillow-10.3.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9e2ec1e921fd07c7cda7962bad283acc2f2a9ccc1b971ee4b216b75fad6f0463"}, - {file = "pillow-10.3.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:4c8e73e99da7db1b4cad7f8d682cf6abad7844da39834c288fbfa394a47bbced"}, - {file = "pillow-10.3.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:16563993329b79513f59142a6b02055e10514c1a8e86dca8b48a893e33cf91e3"}, - {file = "pillow-10.3.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:dd78700f5788ae180b5ee8902c6aea5a5726bac7c364b202b4b3e3ba2d293170"}, - {file = "pillow-10.3.0-pp39-pypy39_pp73-macosx_10_10_x86_64.whl", hash = "sha256:aff76a55a8aa8364d25400a210a65ff59d0168e0b4285ba6bf2bd83cf675ba32"}, - {file = "pillow-10.3.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:b7bc2176354defba3edc2b9a777744462da2f8e921fbaf61e52acb95bafa9828"}, - {file = "pillow-10.3.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:793b4e24db2e8742ca6423d3fde8396db336698c55cd34b660663ee9e45ed37f"}, - {file = "pillow-10.3.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d93480005693d247f8346bc8ee28c72a2191bdf1f6b5db469c096c0c867ac015"}, - {file = "pillow-10.3.0-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c83341b89884e2b2e55886e8fbbf37c3fa5efd6c8907124aeb72f285ae5696e5"}, - {file = "pillow-10.3.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:1a1d1915db1a4fdb2754b9de292642a39a7fb28f1736699527bb649484fb966a"}, - {file = "pillow-10.3.0-pp39-pypy39_pp73-win_amd64.whl", hash = 
"sha256:a0eaa93d054751ee9964afa21c06247779b90440ca41d184aeb5d410f20ff591"}, - {file = "pillow-10.3.0.tar.gz", hash = "sha256:9d2455fbf44c914840c793e89aa82d0e1763a14253a000743719ae5946814b2d"}, + {file = "pillow-10.4.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:4d9667937cfa347525b319ae34375c37b9ee6b525440f3ef48542fcf66f2731e"}, + {file = "pillow-10.4.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:543f3dc61c18dafb755773efc89aae60d06b6596a63914107f75459cf984164d"}, + {file = "pillow-10.4.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7928ecbf1ece13956b95d9cbcfc77137652b02763ba384d9ab508099a2eca856"}, + {file = "pillow-10.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e4d49b85c4348ea0b31ea63bc75a9f3857869174e2bf17e7aba02945cd218e6f"}, + {file = "pillow-10.4.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:6c762a5b0997f5659a5ef2266abc1d8851ad7749ad9a6a5506eb23d314e4f46b"}, + {file = "pillow-10.4.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:a985e028fc183bf12a77a8bbf36318db4238a3ded7fa9df1b9a133f1cb79f8fc"}, + {file = "pillow-10.4.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:812f7342b0eee081eaec84d91423d1b4650bb9828eb53d8511bcef8ce5aecf1e"}, + {file = "pillow-10.4.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:ac1452d2fbe4978c2eec89fb5a23b8387aba707ac72810d9490118817d9c0b46"}, + {file = "pillow-10.4.0-cp310-cp310-win32.whl", hash = "sha256:bcd5e41a859bf2e84fdc42f4edb7d9aba0a13d29a2abadccafad99de3feff984"}, + {file = "pillow-10.4.0-cp310-cp310-win_amd64.whl", hash = "sha256:ecd85a8d3e79cd7158dec1c9e5808e821feea088e2f69a974db5edf84dc53141"}, + {file = "pillow-10.4.0-cp310-cp310-win_arm64.whl", hash = "sha256:ff337c552345e95702c5fde3158acb0625111017d0e5f24bf3acdb9cc16b90d1"}, + {file = "pillow-10.4.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:0a9ec697746f268507404647e531e92889890a087e03681a3606d9b920fbee3c"}, + {file = "pillow-10.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:dfe91cb65544a1321e631e696759491ae04a2ea11d36715eca01ce07284738be"}, + {file = "pillow-10.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5dc6761a6efc781e6a1544206f22c80c3af4c8cf461206d46a1e6006e4429ff3"}, + {file = "pillow-10.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5e84b6cc6a4a3d76c153a6b19270b3526a5a8ed6b09501d3af891daa2a9de7d6"}, + {file = "pillow-10.4.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:bbc527b519bd3aa9d7f429d152fea69f9ad37c95f0b02aebddff592688998abe"}, + {file = "pillow-10.4.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:76a911dfe51a36041f2e756b00f96ed84677cdeb75d25c767f296c1c1eda1319"}, + {file = "pillow-10.4.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:59291fb29317122398786c2d44427bbd1a6d7ff54017075b22be9d21aa59bd8d"}, + {file = "pillow-10.4.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:416d3a5d0e8cfe4f27f574362435bc9bae57f679a7158e0096ad2beb427b8696"}, + {file = "pillow-10.4.0-cp311-cp311-win32.whl", hash = "sha256:7086cc1d5eebb91ad24ded9f58bec6c688e9f0ed7eb3dbbf1e4800280a896496"}, + {file = "pillow-10.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:cbed61494057c0f83b83eb3a310f0bf774b09513307c434d4366ed64f4128a91"}, + {file = "pillow-10.4.0-cp311-cp311-win_arm64.whl", hash = "sha256:f5f0c3e969c8f12dd2bb7e0b15d5c468b51e5017e01e2e867335c81903046a22"}, + {file = "pillow-10.4.0-cp312-cp312-macosx_10_10_x86_64.whl", hash = 
"sha256:673655af3eadf4df6b5457033f086e90299fdd7a47983a13827acf7459c15d94"}, + {file = "pillow-10.4.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:866b6942a92f56300012f5fbac71f2d610312ee65e22f1aa2609e491284e5597"}, + {file = "pillow-10.4.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:29dbdc4207642ea6aad70fbde1a9338753d33fb23ed6956e706936706f52dd80"}, + {file = "pillow-10.4.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bf2342ac639c4cf38799a44950bbc2dfcb685f052b9e262f446482afaf4bffca"}, + {file = "pillow-10.4.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:f5b92f4d70791b4a67157321c4e8225d60b119c5cc9aee8ecf153aace4aad4ef"}, + {file = "pillow-10.4.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:86dcb5a1eb778d8b25659d5e4341269e8590ad6b4e8b44d9f4b07f8d136c414a"}, + {file = "pillow-10.4.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:780c072c2e11c9b2c7ca37f9a2ee8ba66f44367ac3e5c7832afcfe5104fd6d1b"}, + {file = "pillow-10.4.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:37fb69d905be665f68f28a8bba3c6d3223c8efe1edf14cc4cfa06c241f8c81d9"}, + {file = "pillow-10.4.0-cp312-cp312-win32.whl", hash = "sha256:7dfecdbad5c301d7b5bde160150b4db4c659cee2b69589705b6f8a0c509d9f42"}, + {file = "pillow-10.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:1d846aea995ad352d4bdcc847535bd56e0fd88d36829d2c90be880ef1ee4668a"}, + {file = "pillow-10.4.0-cp312-cp312-win_arm64.whl", hash = "sha256:e553cad5179a66ba15bb18b353a19020e73a7921296a7979c4a2b7f6a5cd57f9"}, + {file = "pillow-10.4.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:8bc1a764ed8c957a2e9cacf97c8b2b053b70307cf2996aafd70e91a082e70df3"}, + {file = "pillow-10.4.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:6209bb41dc692ddfee4942517c19ee81b86c864b626dbfca272ec0f7cff5d9fb"}, + {file = "pillow-10.4.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bee197b30783295d2eb680b311af15a20a8b24024a19c3a26431ff83eb8d1f70"}, + {file = "pillow-10.4.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1ef61f5dd14c300786318482456481463b9d6b91ebe5ef12f405afbba77ed0be"}, + {file = "pillow-10.4.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:297e388da6e248c98bc4a02e018966af0c5f92dfacf5a5ca22fa01cb3179bca0"}, + {file = "pillow-10.4.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:e4db64794ccdf6cb83a59d73405f63adbe2a1887012e308828596100a0b2f6cc"}, + {file = "pillow-10.4.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:bd2880a07482090a3bcb01f4265f1936a903d70bc740bfcb1fd4e8a2ffe5cf5a"}, + {file = "pillow-10.4.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4b35b21b819ac1dbd1233317adeecd63495f6babf21b7b2512d244ff6c6ce309"}, + {file = "pillow-10.4.0-cp313-cp313-win32.whl", hash = "sha256:551d3fd6e9dc15e4c1eb6fc4ba2b39c0c7933fa113b220057a34f4bb3268a060"}, + {file = "pillow-10.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:030abdbe43ee02e0de642aee345efa443740aa4d828bfe8e2eb11922ea6a21ea"}, + {file = "pillow-10.4.0-cp313-cp313-win_arm64.whl", hash = "sha256:5b001114dd152cfd6b23befeb28d7aee43553e2402c9f159807bf55f33af8a8d"}, + {file = "pillow-10.4.0-cp38-cp38-macosx_10_10_x86_64.whl", hash = "sha256:8d4d5063501b6dd4024b8ac2f04962d661222d120381272deea52e3fc52d3736"}, + {file = "pillow-10.4.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7c1ee6f42250df403c5f103cbd2768a28fe1a0ea1f0f03fe151c8741e1469c8b"}, + {file = "pillow-10.4.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", 
hash = "sha256:b15e02e9bb4c21e39876698abf233c8c579127986f8207200bc8a8f6bb27acf2"}, + {file = "pillow-10.4.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7a8d4bade9952ea9a77d0c3e49cbd8b2890a399422258a77f357b9cc9be8d680"}, + {file = "pillow-10.4.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:43efea75eb06b95d1631cb784aa40156177bf9dd5b4b03ff38979e048258bc6b"}, + {file = "pillow-10.4.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:950be4d8ba92aca4b2bb0741285a46bfae3ca699ef913ec8416c1b78eadd64cd"}, + {file = "pillow-10.4.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:d7480af14364494365e89d6fddc510a13e5a2c3584cb19ef65415ca57252fb84"}, + {file = "pillow-10.4.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:73664fe514b34c8f02452ffb73b7a92c6774e39a647087f83d67f010eb9a0cf0"}, + {file = "pillow-10.4.0-cp38-cp38-win32.whl", hash = "sha256:e88d5e6ad0d026fba7bdab8c3f225a69f063f116462c49892b0149e21b6c0a0e"}, + {file = "pillow-10.4.0-cp38-cp38-win_amd64.whl", hash = "sha256:5161eef006d335e46895297f642341111945e2c1c899eb406882a6c61a4357ab"}, + {file = "pillow-10.4.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:0ae24a547e8b711ccaaf99c9ae3cd975470e1a30caa80a6aaee9a2f19c05701d"}, + {file = "pillow-10.4.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:298478fe4f77a4408895605f3482b6cc6222c018b2ce565c2b6b9c354ac3229b"}, + {file = "pillow-10.4.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:134ace6dc392116566980ee7436477d844520a26a4b1bd4053f6f47d096997fd"}, + {file = "pillow-10.4.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:930044bb7679ab003b14023138b50181899da3f25de50e9dbee23b61b4de2126"}, + {file = "pillow-10.4.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:c76e5786951e72ed3686e122d14c5d7012f16c8303a674d18cdcd6d89557fc5b"}, + {file = "pillow-10.4.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:b2724fdb354a868ddf9a880cb84d102da914e99119211ef7ecbdc613b8c96b3c"}, + {file = "pillow-10.4.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:dbc6ae66518ab3c5847659e9988c3b60dc94ffb48ef9168656e0019a93dbf8a1"}, + {file = "pillow-10.4.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:06b2f7898047ae93fad74467ec3d28fe84f7831370e3c258afa533f81ef7f3df"}, + {file = "pillow-10.4.0-cp39-cp39-win32.whl", hash = "sha256:7970285ab628a3779aecc35823296a7869f889b8329c16ad5a71e4901a3dc4ef"}, + {file = "pillow-10.4.0-cp39-cp39-win_amd64.whl", hash = "sha256:961a7293b2457b405967af9c77dcaa43cc1a8cd50d23c532e62d48ab6cdd56f5"}, + {file = "pillow-10.4.0-cp39-cp39-win_arm64.whl", hash = "sha256:32cda9e3d601a52baccb2856b8ea1fc213c90b340c542dcef77140dfa3278a9e"}, + {file = "pillow-10.4.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:5b4815f2e65b30f5fbae9dfffa8636d992d49705723fe86a3661806e069352d4"}, + {file = "pillow-10.4.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:8f0aef4ef59694b12cadee839e2ba6afeab89c0f39a3adc02ed51d109117b8da"}, + {file = "pillow-10.4.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9f4727572e2918acaa9077c919cbbeb73bd2b3ebcfe033b72f858fc9fbef0026"}, + {file = "pillow-10.4.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ff25afb18123cea58a591ea0244b92eb1e61a1fd497bf6d6384f09bc3262ec3e"}, + {file = "pillow-10.4.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:dc3e2db6ba09ffd7d02ae9141cfa0ae23393ee7687248d46a7507b75d610f4f5"}, + {file = 
"pillow-10.4.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:02a2be69f9c9b8c1e97cf2713e789d4e398c751ecfd9967c18d0ce304efbf885"}, + {file = "pillow-10.4.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:0755ffd4a0c6f267cccbae2e9903d95477ca2f77c4fcf3a3a09570001856c8a5"}, + {file = "pillow-10.4.0-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:a02364621fe369e06200d4a16558e056fe2805d3468350df3aef21e00d26214b"}, + {file = "pillow-10.4.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:1b5dea9831a90e9d0721ec417a80d4cbd7022093ac38a568db2dd78363b00908"}, + {file = "pillow-10.4.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9b885f89040bb8c4a1573566bbb2f44f5c505ef6e74cec7ab9068c900047f04b"}, + {file = "pillow-10.4.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87dd88ded2e6d74d31e1e0a99a726a6765cda32d00ba72dc37f0651f306daaa8"}, + {file = "pillow-10.4.0-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:2db98790afc70118bd0255c2eeb465e9767ecf1f3c25f9a1abb8ffc8cfd1fe0a"}, + {file = "pillow-10.4.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:f7baece4ce06bade126fb84b8af1c33439a76d8a6fd818970215e0560ca28c27"}, + {file = "pillow-10.4.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:cfdd747216947628af7b259d274771d84db2268ca062dd5faf373639d00113a3"}, + {file = "pillow-10.4.0.tar.gz", hash = "sha256:166c1cd4d24309b30d61f79f4a9114b7b2313d7450912277855ff5dfd7cd4a06"}, ] [package.extras] -docs = ["furo", "olefile", "sphinx (>=2.4)", "sphinx-copybutton", "sphinx-inline-tabs", "sphinx-removed-in", "sphinxext-opengraph"] +docs = ["furo", "olefile", "sphinx (>=7.3)", "sphinx-copybutton", "sphinx-inline-tabs", "sphinxext-opengraph"] fpx = ["olefile"] mic = ["olefile"] tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "packaging", "pyroma", "pytest", "pytest-cov", "pytest-timeout"] @@ -4195,109 +4476,121 @@ files = [ [[package]] name = "pydantic" -version = "2.7.4" +version = "2.8.0" description = "Data validation using Python type hints" optional = false python-versions = ">=3.8" files = [ - {file = "pydantic-2.7.4-py3-none-any.whl", hash = "sha256:ee8538d41ccb9c0a9ad3e0e5f07bf15ed8015b481ced539a1759d8cc89ae90d0"}, - {file = "pydantic-2.7.4.tar.gz", hash = "sha256:0c84efd9548d545f63ac0060c1e4d39bb9b14db8b3c0652338aecc07b5adec52"}, + {file = "pydantic-2.8.0-py3-none-any.whl", hash = "sha256:ead4f3a1e92386a734ca1411cb25d94147cf8778ed5be6b56749047676d6364e"}, + {file = "pydantic-2.8.0.tar.gz", hash = "sha256:d970ffb9d030b710795878940bd0489842c638e7252fc4a19c3ae2f7da4d6141"}, ] [package.dependencies] annotated-types = ">=0.4.0" -pydantic-core = "2.18.4" -typing-extensions = ">=4.6.1" +pydantic-core = "2.20.0" +typing-extensions = [ + {version = ">=4.6.1", markers = "python_version < \"3.13\""}, + {version = ">=4.12.2", markers = "python_version >= \"3.13\""}, +] [package.extras] email = ["email-validator (>=2.0.0)"] [[package]] name = "pydantic-core" -version = "2.18.4" +version = "2.20.0" description = "Core functionality for Pydantic validation and serialization" optional = false python-versions = ">=3.8" files = [ - {file = "pydantic_core-2.18.4-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:f76d0ad001edd426b92233d45c746fd08f467d56100fd8f30e9ace4b005266e4"}, - {file = "pydantic_core-2.18.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:59ff3e89f4eaf14050c8022011862df275b552caef8082e37b542b066ce1ff26"}, - {file = 
"pydantic_core-2.18.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a55b5b16c839df1070bc113c1f7f94a0af4433fcfa1b41799ce7606e5c79ce0a"}, - {file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4d0dcc59664fcb8974b356fe0a18a672d6d7cf9f54746c05f43275fc48636851"}, - {file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8951eee36c57cd128f779e641e21eb40bc5073eb28b2d23f33eb0ef14ffb3f5d"}, - {file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4701b19f7e3a06ea655513f7938de6f108123bf7c86bbebb1196eb9bd35cf724"}, - {file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e00a3f196329e08e43d99b79b286d60ce46bed10f2280d25a1718399457e06be"}, - {file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:97736815b9cc893b2b7f663628e63f436018b75f44854c8027040e05230eeddb"}, - {file = "pydantic_core-2.18.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:6891a2ae0e8692679c07728819b6e2b822fb30ca7445f67bbf6509b25a96332c"}, - {file = "pydantic_core-2.18.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:bc4ff9805858bd54d1a20efff925ccd89c9d2e7cf4986144b30802bf78091c3e"}, - {file = "pydantic_core-2.18.4-cp310-none-win32.whl", hash = "sha256:1b4de2e51bbcb61fdebd0ab86ef28062704f62c82bbf4addc4e37fa4b00b7cbc"}, - {file = "pydantic_core-2.18.4-cp310-none-win_amd64.whl", hash = "sha256:6a750aec7bf431517a9fd78cb93c97b9b0c496090fee84a47a0d23668976b4b0"}, - {file = "pydantic_core-2.18.4-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:942ba11e7dfb66dc70f9ae66b33452f51ac7bb90676da39a7345e99ffb55402d"}, - {file = "pydantic_core-2.18.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b2ebef0e0b4454320274f5e83a41844c63438fdc874ea40a8b5b4ecb7693f1c4"}, - {file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a642295cd0c8df1b86fc3dced1d067874c353a188dc8e0f744626d49e9aa51c4"}, - {file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5f09baa656c904807e832cf9cce799c6460c450c4ad80803517032da0cd062e2"}, - {file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:98906207f29bc2c459ff64fa007afd10a8c8ac080f7e4d5beff4c97086a3dabd"}, - {file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:19894b95aacfa98e7cb093cd7881a0c76f55731efad31073db4521e2b6ff5b7d"}, - {file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0fbbdc827fe5e42e4d196c746b890b3d72876bdbf160b0eafe9f0334525119c8"}, - {file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f85d05aa0918283cf29a30b547b4df2fbb56b45b135f9e35b6807cb28bc47951"}, - {file = "pydantic_core-2.18.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e85637bc8fe81ddb73fda9e56bab24560bdddfa98aa64f87aaa4e4b6730c23d2"}, - {file = "pydantic_core-2.18.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:2f5966897e5461f818e136b8451d0551a2e77259eb0f73a837027b47dc95dab9"}, - {file = "pydantic_core-2.18.4-cp311-none-win32.whl", hash = "sha256:44c7486a4228413c317952e9d89598bcdfb06399735e49e0f8df643e1ccd0558"}, - {file = "pydantic_core-2.18.4-cp311-none-win_amd64.whl", hash = 
"sha256:8a7164fe2005d03c64fd3b85649891cd4953a8de53107940bf272500ba8a788b"}, - {file = "pydantic_core-2.18.4-cp311-none-win_arm64.whl", hash = "sha256:4e99bc050fe65c450344421017f98298a97cefc18c53bb2f7b3531eb39bc7805"}, - {file = "pydantic_core-2.18.4-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:6f5c4d41b2771c730ea1c34e458e781b18cc668d194958e0112455fff4e402b2"}, - {file = "pydantic_core-2.18.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2fdf2156aa3d017fddf8aea5adfba9f777db1d6022d392b682d2a8329e087cef"}, - {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4748321b5078216070b151d5271ef3e7cc905ab170bbfd27d5c83ee3ec436695"}, - {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:847a35c4d58721c5dc3dba599878ebbdfd96784f3fb8bb2c356e123bdcd73f34"}, - {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3c40d4eaad41f78e3bbda31b89edc46a3f3dc6e171bf0ecf097ff7a0ffff7cb1"}, - {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:21a5e440dbe315ab9825fcd459b8814bb92b27c974cbc23c3e8baa2b76890077"}, - {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:01dd777215e2aa86dfd664daed5957704b769e726626393438f9c87690ce78c3"}, - {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:4b06beb3b3f1479d32befd1f3079cc47b34fa2da62457cdf6c963393340b56e9"}, - {file = "pydantic_core-2.18.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:564d7922e4b13a16b98772441879fcdcbe82ff50daa622d681dd682175ea918c"}, - {file = "pydantic_core-2.18.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:0eb2a4f660fcd8e2b1c90ad566db2b98d7f3f4717c64fe0a83e0adb39766d5b8"}, - {file = "pydantic_core-2.18.4-cp312-none-win32.whl", hash = "sha256:8b8bab4c97248095ae0c4455b5a1cd1cdd96e4e4769306ab19dda135ea4cdb07"}, - {file = "pydantic_core-2.18.4-cp312-none-win_amd64.whl", hash = "sha256:14601cdb733d741b8958224030e2bfe21a4a881fb3dd6fbb21f071cabd48fa0a"}, - {file = "pydantic_core-2.18.4-cp312-none-win_arm64.whl", hash = "sha256:c1322d7dd74713dcc157a2b7898a564ab091ca6c58302d5c7b4c07296e3fd00f"}, - {file = "pydantic_core-2.18.4-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:823be1deb01793da05ecb0484d6c9e20baebb39bd42b5d72636ae9cf8350dbd2"}, - {file = "pydantic_core-2.18.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ebef0dd9bf9b812bf75bda96743f2a6c5734a02092ae7f721c048d156d5fabae"}, - {file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ae1d6df168efb88d7d522664693607b80b4080be6750c913eefb77e34c12c71a"}, - {file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f9899c94762343f2cc2fc64c13e7cae4c3cc65cdfc87dd810a31654c9b7358cc"}, - {file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:99457f184ad90235cfe8461c4d70ab7dd2680e28821c29eca00252ba90308c78"}, - {file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:18f469a3d2a2fdafe99296a87e8a4c37748b5080a26b806a707f25a902c040a8"}, - {file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b7cdf28938ac6b8b49ae5e92f2735056a7ba99c9b110a474473fd71185c1af5d"}, - {file = 
"pydantic_core-2.18.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:938cb21650855054dc54dfd9120a851c974f95450f00683399006aa6e8abb057"}, - {file = "pydantic_core-2.18.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:44cd83ab6a51da80fb5adbd9560e26018e2ac7826f9626bc06ca3dc074cd198b"}, - {file = "pydantic_core-2.18.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:972658f4a72d02b8abfa2581d92d59f59897d2e9f7e708fdabe922f9087773af"}, - {file = "pydantic_core-2.18.4-cp38-none-win32.whl", hash = "sha256:1d886dc848e60cb7666f771e406acae54ab279b9f1e4143babc9c2258213daa2"}, - {file = "pydantic_core-2.18.4-cp38-none-win_amd64.whl", hash = "sha256:bb4462bd43c2460774914b8525f79b00f8f407c945d50881568f294c1d9b4443"}, - {file = "pydantic_core-2.18.4-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:44a688331d4a4e2129140a8118479443bd6f1905231138971372fcde37e43528"}, - {file = "pydantic_core-2.18.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a2fdd81edd64342c85ac7cf2753ccae0b79bf2dfa063785503cb85a7d3593223"}, - {file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:86110d7e1907ab36691f80b33eb2da87d780f4739ae773e5fc83fb272f88825f"}, - {file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:46387e38bd641b3ee5ce247563b60c5ca098da9c56c75c157a05eaa0933ed154"}, - {file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:123c3cec203e3f5ac7b000bd82235f1a3eced8665b63d18be751f115588fea30"}, - {file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dc1803ac5c32ec324c5261c7209e8f8ce88e83254c4e1aebdc8b0a39f9ddb443"}, - {file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:53db086f9f6ab2b4061958d9c276d1dbe3690e8dd727d6abf2321d6cce37fa94"}, - {file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:abc267fa9837245cc28ea6929f19fa335f3dc330a35d2e45509b6566dc18be23"}, - {file = "pydantic_core-2.18.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:a0d829524aaefdebccb869eed855e2d04c21d2d7479b6cada7ace5448416597b"}, - {file = "pydantic_core-2.18.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:509daade3b8649f80d4e5ff21aa5673e4ebe58590b25fe42fac5f0f52c6f034a"}, - {file = "pydantic_core-2.18.4-cp39-none-win32.whl", hash = "sha256:ca26a1e73c48cfc54c4a76ff78df3727b9d9f4ccc8dbee4ae3f73306a591676d"}, - {file = "pydantic_core-2.18.4-cp39-none-win_amd64.whl", hash = "sha256:c67598100338d5d985db1b3d21f3619ef392e185e71b8d52bceacc4a7771ea7e"}, - {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:574d92eac874f7f4db0ca653514d823a0d22e2354359d0759e3f6a406db5d55d"}, - {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:1f4d26ceb5eb9eed4af91bebeae4b06c3fb28966ca3a8fb765208cf6b51102ab"}, - {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77450e6d20016ec41f43ca4a6c63e9fdde03f0ae3fe90e7c27bdbeaece8b1ed4"}, - {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d323a01da91851a4f17bf592faf46149c9169d68430b3146dcba2bb5e5719abc"}, - {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:43d447dd2ae072a0065389092a231283f62d960030ecd27565672bd40746c507"}, - {file = 
"pydantic_core-2.18.4-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:578e24f761f3b425834f297b9935e1ce2e30f51400964ce4801002435a1b41ef"}, - {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:81b5efb2f126454586d0f40c4d834010979cb80785173d1586df845a632e4e6d"}, - {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:ab86ce7c8f9bea87b9d12c7f0af71102acbf5ecbc66c17796cff45dae54ef9a5"}, - {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:90afc12421df2b1b4dcc975f814e21bc1754640d502a2fbcc6d41e77af5ec312"}, - {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:51991a89639a912c17bef4b45c87bd83593aee0437d8102556af4885811d59f5"}, - {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:293afe532740370aba8c060882f7d26cfd00c94cae32fd2e212a3a6e3b7bc15e"}, - {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b48ece5bde2e768197a2d0f6e925f9d7e3e826f0ad2271120f8144a9db18d5c8"}, - {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:eae237477a873ab46e8dd748e515c72c0c804fb380fbe6c85533c7de51f23a8f"}, - {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:834b5230b5dfc0c1ec37b2fda433b271cbbc0e507560b5d1588e2cc1148cf1ce"}, - {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:e858ac0a25074ba4bce653f9b5d0a85b7456eaddadc0ce82d3878c22489fa4ee"}, - {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:2fd41f6eff4c20778d717af1cc50eca52f5afe7805ee530a4fbd0bae284f16e9"}, - {file = "pydantic_core-2.18.4.tar.gz", hash = "sha256:ec3beeada09ff865c344ff3bc2f427f5e6c26401cc6113d77e372c3fdac73864"}, + {file = "pydantic_core-2.20.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:e9dcd7fb34f7bfb239b5fa420033642fff0ad676b765559c3737b91f664d4fa9"}, + {file = "pydantic_core-2.20.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:649a764d9b0da29816889424697b2a3746963ad36d3e0968784ceed6e40c6355"}, + {file = "pydantic_core-2.20.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7701df088d0b05f3460f7ba15aec81ac8b0fb5690367dfd072a6c38cf5b7fdb5"}, + {file = "pydantic_core-2.20.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ab760f17c3e792225cdaef31ca23c0aea45c14ce80d8eff62503f86a5ab76bff"}, + {file = "pydantic_core-2.20.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cb1ad5b4d73cde784cf64580166568074f5ccd2548d765e690546cff3d80937d"}, + {file = "pydantic_core-2.20.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b81ec2efc04fc1dbf400647d4357d64fb25543bae38d2d19787d69360aad21c9"}, + {file = "pydantic_core-2.20.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c4a9732a5cad764ba37f3aa873dccb41b584f69c347a57323eda0930deec8e10"}, + {file = "pydantic_core-2.20.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6dc85b9e10cc21d9c1055f15684f76fa4facadddcb6cd63abab702eb93c98943"}, + {file = "pydantic_core-2.20.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:21d9f7e24f63fdc7118e6cc49defaab8c1d27570782f7e5256169d77498cf7c7"}, + {file = "pydantic_core-2.20.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = 
"sha256:8b315685832ab9287e6124b5d74fc12dda31e6421d7f6b08525791452844bc2d"}, + {file = "pydantic_core-2.20.0-cp310-none-win32.whl", hash = "sha256:c3dc8ec8b87c7ad534c75b8855168a08a7036fdb9deeeed5705ba9410721c84d"}, + {file = "pydantic_core-2.20.0-cp310-none-win_amd64.whl", hash = "sha256:85770b4b37bb36ef93a6122601795231225641003e0318d23c6233c59b424279"}, + {file = "pydantic_core-2.20.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:58e251bb5a5998f7226dc90b0b753eeffa720bd66664eba51927c2a7a2d5f32c"}, + {file = "pydantic_core-2.20.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:78d584caac52c24240ef9ecd75de64c760bbd0e20dbf6973631815e3ef16ef8b"}, + {file = "pydantic_core-2.20.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5084ec9721f82bef5ff7c4d1ee65e1626783abb585f8c0993833490b63fe1792"}, + {file = "pydantic_core-2.20.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:6d0f52684868db7c218437d260e14d37948b094493f2646f22d3dda7229bbe3f"}, + {file = "pydantic_core-2.20.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1def125d59a87fe451212a72ab9ed34c118ff771e5473fef4f2f95d8ede26d75"}, + {file = "pydantic_core-2.20.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b34480fd6778ab356abf1e9086a4ced95002a1e195e8d2fd182b0def9d944d11"}, + {file = "pydantic_core-2.20.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d42669d319db366cb567c3b444f43caa7ffb779bf9530692c6f244fc635a41eb"}, + {file = "pydantic_core-2.20.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:53b06aea7a48919a254b32107647be9128c066aaa6ee6d5d08222325f25ef175"}, + {file = "pydantic_core-2.20.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:1f038156b696a1c39d763b2080aeefa87ddb4162c10aa9fabfefffc3dd8180fa"}, + {file = "pydantic_core-2.20.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:3f0f3a4a23717280a5ee3ac4fb1f81d6fde604c9ec5100f7f6f987716bb8c137"}, + {file = "pydantic_core-2.20.0-cp311-none-win32.whl", hash = "sha256:316fe7c3fec017affd916a0c83d6f1ec697cbbbdf1124769fa73328e7907cc2e"}, + {file = "pydantic_core-2.20.0-cp311-none-win_amd64.whl", hash = "sha256:2d06a7fa437f93782e3f32d739c3ec189f82fca74336c08255f9e20cea1ed378"}, + {file = "pydantic_core-2.20.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:d6f8c49657f3eb7720ed4c9b26624063da14937fc94d1812f1e04a2204db3e17"}, + {file = "pydantic_core-2.20.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ad1bd2f377f56fec11d5cfd0977c30061cd19f4fa199bf138b200ec0d5e27eeb"}, + {file = "pydantic_core-2.20.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ed741183719a5271f97d93bbcc45ed64619fa38068aaa6e90027d1d17e30dc8d"}, + {file = "pydantic_core-2.20.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d82e5ed3a05f2dcb89c6ead2fd0dbff7ac09bc02c1b4028ece2d3a3854d049ce"}, + {file = "pydantic_core-2.20.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b2ba34a099576234671f2e4274e5bc6813b22e28778c216d680eabd0db3f7dad"}, + {file = "pydantic_core-2.20.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:879ae6bb08a063b3e1b7ac8c860096d8fd6b48dd9b2690b7f2738b8c835e744b"}, + {file = "pydantic_core-2.20.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b0eefc7633a04c0694340aad91fbfd1986fe1a1e0c63a22793ba40a18fcbdc8"}, + {file = 
"pydantic_core-2.20.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:73deadd6fd8a23e2f40b412b3ac617a112143c8989a4fe265050fd91ba5c0608"}, + {file = "pydantic_core-2.20.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:35681445dc85446fb105943d81ae7569aa7e89de80d1ca4ac3229e05c311bdb1"}, + {file = "pydantic_core-2.20.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:0f6dd3612a3b9f91f2e63924ea18a4476656c6d01843ca20a4c09e00422195af"}, + {file = "pydantic_core-2.20.0-cp312-none-win32.whl", hash = "sha256:7e37b6bb6e90c2b8412b06373c6978d9d81e7199a40e24a6ef480e8acdeaf918"}, + {file = "pydantic_core-2.20.0-cp312-none-win_amd64.whl", hash = "sha256:7d4df13d1c55e84351fab51383520b84f490740a9f1fec905362aa64590b7a5d"}, + {file = "pydantic_core-2.20.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:d43e7ab3b65e4dc35a7612cfff7b0fd62dce5bc11a7cd198310b57f39847fd6c"}, + {file = "pydantic_core-2.20.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7b6a24d7b5893392f2b8e3b7a0031ae3b14c6c1942a4615f0d8794fdeeefb08b"}, + {file = "pydantic_core-2.20.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b2f13c3e955a087c3ec86f97661d9f72a76e221281b2262956af381224cfc243"}, + {file = "pydantic_core-2.20.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:72432fd6e868c8d0a6849869e004b8bcae233a3c56383954c228316694920b38"}, + {file = "pydantic_core-2.20.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d70a8ff2d4953afb4cbe6211f17268ad29c0b47e73d3372f40e7775904bc28fc"}, + {file = "pydantic_core-2.20.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8e49524917b8d3c2f42cd0d2df61178e08e50f5f029f9af1f402b3ee64574392"}, + {file = "pydantic_core-2.20.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a4f0f71653b1c1bad0350bc0b4cc057ab87b438ff18fa6392533811ebd01439c"}, + {file = "pydantic_core-2.20.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:16197e6f4fdecb9892ed2436e507e44f0a1aa2cff3b9306d1c879ea2f9200997"}, + {file = "pydantic_core-2.20.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:763602504bf640b3ded3bba3f8ed8a1cc2fc6a87b8d55c1c5689f428c49c947e"}, + {file = "pydantic_core-2.20.0-cp313-none-win32.whl", hash = "sha256:a3f243f318bd9523277fa123b3163f4c005a3e8619d4b867064de02f287a564d"}, + {file = "pydantic_core-2.20.0-cp313-none-win_amd64.whl", hash = "sha256:03aceaf6a5adaad3bec2233edc5a7905026553916615888e53154807e404545c"}, + {file = "pydantic_core-2.20.0-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:d6f2d8b8da1f03f577243b07bbdd3412eee3d37d1f2fd71d1513cbc76a8c1239"}, + {file = "pydantic_core-2.20.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:a272785a226869416c6b3c1b7e450506152d3844207331f02f27173562c917e0"}, + {file = "pydantic_core-2.20.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:efbb412d55a4ffe73963fed95c09ccb83647ec63b711c4b3752be10a56f0090b"}, + {file = "pydantic_core-2.20.0-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1e4f46189d8740561b43655263a41aac75ff0388febcb2c9ec4f1b60a0ec12f3"}, + {file = "pydantic_core-2.20.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:87d3df115f4a3c8c5e4d5acf067d399c6466d7e604fc9ee9acbe6f0c88a0c3cf"}, + {file = "pydantic_core-2.20.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a340d2bdebe819d08f605e9705ed551c3feb97e4fd71822d7147c1e4bdbb9508"}, + {file = 
"pydantic_core-2.20.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:616b9c2f882393d422ba11b40e72382fe975e806ad693095e9a3b67c59ea6150"}, + {file = "pydantic_core-2.20.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:25c46bb2ff6084859bbcfdf4f1a63004b98e88b6d04053e8bf324e115398e9e7"}, + {file = "pydantic_core-2.20.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:23425eccef8f2c342f78d3a238c824623836c6c874d93c726673dbf7e56c78c0"}, + {file = "pydantic_core-2.20.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:52527e8f223ba29608d999d65b204676398009725007c9336651c2ec2d93cffc"}, + {file = "pydantic_core-2.20.0-cp38-none-win32.whl", hash = "sha256:1c3c5b7f70dd19a6845292b0775295ea81c61540f68671ae06bfe4421b3222c2"}, + {file = "pydantic_core-2.20.0-cp38-none-win_amd64.whl", hash = "sha256:8093473d7b9e908af1cef30025609afc8f5fd2a16ff07f97440fd911421e4432"}, + {file = "pydantic_core-2.20.0-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:ee7785938e407418795e4399b2bf5b5f3cf6cf728077a7f26973220d58d885cf"}, + {file = "pydantic_core-2.20.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:0e75794883d635071cf6b4ed2a5d7a1e50672ab7a051454c76446ef1ebcdcc91"}, + {file = "pydantic_core-2.20.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:344e352c96e53b4f56b53d24728217c69399b8129c16789f70236083c6ceb2ac"}, + {file = "pydantic_core-2.20.0-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:978d4123ad1e605daf1ba5e01d4f235bcf7b6e340ef07e7122e8e9cfe3eb61ab"}, + {file = "pydantic_core-2.20.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3c05eaf6c863781eb834ab41f5963604ab92855822a2062897958089d1335dad"}, + {file = "pydantic_core-2.20.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bc7e43b4a528ffca8c9151b6a2ca34482c2fdc05e6aa24a84b7f475c896fc51d"}, + {file = "pydantic_core-2.20.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:658287a29351166510ebbe0a75c373600cc4367a3d9337b964dada8d38bcc0f4"}, + {file = "pydantic_core-2.20.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1dacf660d6de692fe351e8c806e7efccf09ee5184865893afbe8e59be4920b4a"}, + {file = "pydantic_core-2.20.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:3e147fc6e27b9a487320d78515c5f29798b539179f7777018cedf51b7749e4f4"}, + {file = "pydantic_core-2.20.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:c867230d715a3dd1d962c8d9bef0d3168994ed663e21bf748b6e3a529a129aab"}, + {file = "pydantic_core-2.20.0-cp39-none-win32.whl", hash = "sha256:22b813baf0dbf612752d8143a2dbf8e33ccb850656b7850e009bad2e101fc377"}, + {file = "pydantic_core-2.20.0-cp39-none-win_amd64.whl", hash = "sha256:3a7235b46c1bbe201f09b6f0f5e6c36b16bad3d0532a10493742f91fbdc8035f"}, + {file = "pydantic_core-2.20.0-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:cafde15a6f7feaec2f570646e2ffc5b73412295d29134a29067e70740ec6ee20"}, + {file = "pydantic_core-2.20.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:2aec8eeea0b08fd6bc2213d8e86811a07491849fd3d79955b62d83e32fa2ad5f"}, + {file = "pydantic_core-2.20.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:840200827984f1c4e114008abc2f5ede362d6e11ed0b5931681884dd41852ff1"}, + {file = "pydantic_core-2.20.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f8ea1d8b7df522e5ced34993c423c3bf3735c53df8b2a15688a2f03a7d678800"}, + {file = 
"pydantic_core-2.20.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d5b8376a867047bf08910573deb95d3c8dfb976eb014ee24f3b5a61ccc5bee1b"}, + {file = "pydantic_core-2.20.0-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:d08264b4460326cefacc179fc1411304d5af388a79910832835e6f641512358b"}, + {file = "pydantic_core-2.20.0-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:7a3639011c2e8a9628466f616ed7fb413f30032b891898e10895a0a8b5857d6c"}, + {file = "pydantic_core-2.20.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:05e83ce2f7eba29e627dd8066aa6c4c0269b2d4f889c0eba157233a353053cea"}, + {file = "pydantic_core-2.20.0-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:603a843fea76a595c8f661cd4da4d2281dff1e38c4a836a928eac1a2f8fe88e4"}, + {file = "pydantic_core-2.20.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:ac76f30d5d3454f4c28826d891fe74d25121a346c69523c9810ebba43f3b1cec"}, + {file = "pydantic_core-2.20.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:22e3b1d4b1b3f6082849f9b28427ef147a5b46a6132a3dbaf9ca1baa40c88609"}, + {file = "pydantic_core-2.20.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2761f71faed820e25ec62eacba670d1b5c2709bb131a19fcdbfbb09884593e5a"}, + {file = "pydantic_core-2.20.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a0586cddbf4380e24569b8a05f234e7305717cc8323f50114dfb2051fcbce2a3"}, + {file = "pydantic_core-2.20.0-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:b8c46a8cf53e849eea7090f331ae2202cd0f1ceb090b00f5902c423bd1e11805"}, + {file = "pydantic_core-2.20.0-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:b4a085bd04af7245e140d1b95619fe8abb445a3d7fdf219b3f80c940853268ef"}, + {file = "pydantic_core-2.20.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:116b326ac82c8b315e7348390f6d30bcfe6e688a7d3f1de50ff7bcc2042a23c2"}, + {file = "pydantic_core-2.20.0.tar.gz", hash = "sha256:366be8e64e0cb63d87cf79b4e1765c0703dd6313c729b22e7b9e378db6b96877"}, ] [package.dependencies] @@ -4331,6 +4624,25 @@ doc = ["ablog (>=0.11.8)", "colorama", "graphviz", "ipykernel", "ipyleaflet", "i i18n = ["Babel", "jinja2"] test = ["pytest", "pytest-cov", "pytest-regressions", "sphinx[test]"] +[[package]] +name = "pydot" +version = "2.0.0" +description = "Python interface to Graphviz's Dot" +optional = false +python-versions = ">=3.7" +files = [ + {file = "pydot-2.0.0-py3-none-any.whl", hash = "sha256:408a47913ea7bd5d2d34b274144880c1310c4aee901f353cf21fe2e526a4ea28"}, + {file = "pydot-2.0.0.tar.gz", hash = "sha256:60246af215123fa062f21cd791be67dda23a6f280df09f68919e637a1e4f3235"}, +] + +[package.dependencies] +pyparsing = ">=3" + +[package.extras] +dev = ["black", "chardet"] +release = ["zest.releaser[recommended]"] +tests = ["black", "chardet", "tox"] + [[package]] name = "pygments" version = "2.18.0" @@ -4427,6 +4739,22 @@ files = [ {file = "pytz-2024.1.tar.gz", hash = "sha256:2a29735ea9c18baf14b448846bde5a48030ed267578472d8955cd0e7443a9812"}, ] +[[package]] +name = "pyvis" +version = "0.3.2" +description = "A Python network graph visualization library" +optional = false +python-versions = ">3.6" +files = [ + {file = "pyvis-0.3.2-py3-none-any.whl", hash = "sha256:5720c4ca8161dc5d9ab352015723abb7a8bb8fb443edeb07f7a322db34a97555"}, +] + +[package.dependencies] +ipython = ">=5.3.0" +jinja2 = ">=2.9.6" +jsonpickle = ">=1.4.1" +networkx = ">=1.11" + [[package]] name = "pywin32" version = "306" @@ -4684,6 +5012,17 @@ 
numpy = "*" [package.extras] dev = ["pytest"] +[[package]] +name = "readthedocs-sphinx-search" +version = "0.3.2" +description = "Sphinx extension to enable search as you type for docs hosted on Read the Docs." +optional = false +python-versions = ">=3.6" +files = [ + {file = "readthedocs-sphinx-search-0.3.2.tar.gz", hash = "sha256:277773bfa28566a86694c08e568d5a648cd80f22826545555a764d6d20c365fb"}, + {file = "readthedocs_sphinx_search-0.3.2-py3-none-any.whl", hash = "sha256:58716fd21f01581e6e67bf3bc02e79c77e10dc58b5f8e4c7cc1977e013eda173"}, +] + [[package]] name = "referencing" version = "0.35.1" @@ -5224,18 +5563,18 @@ train = ["accelerate (>=0.20.3)", "datasets"] [[package]] name = "setuptools" -version = "70.1.1" +version = "70.2.0" description = "Easily download, build, install, upgrade, and uninstall Python packages" optional = false python-versions = ">=3.8" files = [ - {file = "setuptools-70.1.1-py3-none-any.whl", hash = "sha256:a58a8fde0541dab0419750bcc521fbdf8585f6e5cb41909df3a472ef7b81ca95"}, - {file = "setuptools-70.1.1.tar.gz", hash = "sha256:937a48c7cdb7a21eb53cd7f9b59e525503aa8abaf3584c730dc5f7a5bec3a650"}, + {file = "setuptools-70.2.0-py3-none-any.whl", hash = "sha256:b8b8060bb426838fbe942479c90296ce976249451118ef566a5a0b7d8b78fb05"}, + {file = "setuptools-70.2.0.tar.gz", hash = "sha256:bd63e505105011b25c3c11f753f7e3b8465ea739efddaccef8f0efac2137bac1"}, ] [package.extras] -docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"] -testing = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "jaraco.test", "mypy (==1.10.0)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.1)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf", "pytest-ruff (>=0.3.2)", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] +doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"] +test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "jaraco.test", "mypy (==1.10.0)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf", "pytest-ruff (>=0.3.2)", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] [[package]] name = "six" @@ -5969,13 +6308,13 @@ test = ["argcomplete (>=3.0.3)", "mypy (>=1.7.0)", "pre-commit", "pytest (>=7.0, [[package]] name = "transformers" -version = "4.42.1" +version = "4.42.3" description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow" optional = false python-versions = ">=3.8.0" files = [ - {file = 
"transformers-4.42.1-py3-none-any.whl", hash = "sha256:d7392acf1e35a108e8abd2e3ea8f6ffc1d34dcb6c0275d6297ec337ae5de99b6"}, - {file = "transformers-4.42.1.tar.gz", hash = "sha256:89adfb6b6634f684a85bae1d53cc243a43e30479392b3c873be743af61556f4f"}, + {file = "transformers-4.42.3-py3-none-any.whl", hash = "sha256:a61a0df9609b7d69229d941b2fd857c841ba3043d6da503d0da1a4b133f65b92"}, + {file = "transformers-4.42.3.tar.gz", hash = "sha256:7539873ff45809145265cbc94ea4619d2713c41ceaa277b692d8b0be3430f7eb"}, ] [package.dependencies] @@ -6548,4 +6887,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = ">=3.11, <4.0" -content-hash = "5b47db3b930415e4dd1d709bb047f78b70c67dc49ff6ea291c198ff368eaf327" +content-hash = "91393e3f434457dd547937bf08aa22746cbd99ca66c940e9bc57ac71178b0432" diff --git a/pyproject.toml b/pyproject.toml index 4ebeabb8..e21433ab 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,19 +5,17 @@ version = "0.1.0" description = "A project to develop and test the lightrag library" authors = ["Your Name "] license = "MIT" - -packages = [ - { include = "lightrag", from = "." } -] +packages = [{ from = "_lightrag", include = "lightrag" }] # empty packages list [tool.poetry.dependencies] python = ">=3.11, <4.0" -lightrag = { path = "./lightrag", develop = true } +lightrag = { path = "lightrag", develop = true } torch = "^2.3.1" flagembedding = "^1.2.10" # cohere = "^5.5.7" openai = "^1.34.0" -pgvector = "^0.2.5" +networkx = "^3.3" +pyvis = "^0.3.2" [tool.poetry.group.dev.dependencies] @@ -50,6 +48,9 @@ cohere = "^5.5.8" langchain = "^0.2.5" langchain-community = "^0.2.5" langchain-openai = "^0.1.8" +pydot = "^2.0.0" +matplotlib = "^3.9.0" +pyvis = "^0.3.2" [tool.poetry.group.doc.dependencies] @@ -60,8 +61,9 @@ sphinx = "^7.3.7" nbsphinx = "^0.9.4" nbconvert = "^7.16.4" pandoc = "^2.3" +readthedocs-sphinx-search = "^0.3.2" [build-system] requires = ["poetry-core>=1.0.0"] -build-backend = "poetry.core.masonry.api" \ No newline at end of file +build-backend = "poetry.core.masonry.api" diff --git a/test_pyvis.py b/test_pyvis.py new file mode 100644 index 00000000..39f72639 --- /dev/null +++ b/test_pyvis.py @@ -0,0 +1,15 @@ +from pyvis.network import Network +import networkx as nx + +# Create a simple NetworkX graph +G = nx.DiGraph() +G.add_edges_from([(1, 2), (2, 3), (3, 4)]) + +# Create a Pyvis Network +net = Network(notebook=False, width="100%", height="100%", directed=True) + +# Add nodes and edges from NetworkX to Pyvis +net.from_nx(G) + +# Save the network to an HTML file +net.show("test_network.html") diff --git a/use_cases/__init__.py b/use_cases/__init__.py index e69de29b..d33bab7c 100644 --- a/use_cases/__init__.py +++ b/use_cases/__init__.py @@ -0,0 +1,3 @@ +from lightrag.utils import setup_env + +setup_env() diff --git a/use_cases/classification/__init__.py b/use_cases/classification/__init__.py new file mode 100644 index 00000000..d33bab7c --- /dev/null +++ b/use_cases/classification/__init__.py @@ -0,0 +1,3 @@ +from lightrag.utils import setup_env + +setup_env() diff --git a/use_cases/classification/task.py b/use_cases/classification/task.py index 8d9d9760..79e5754e 100644 --- a/use_cases/classification/task.py +++ b/use_cases/classification/task.py @@ -1,23 +1,17 @@ from typing import Dict, Any -from dataclasses import dataclass, field +from dataclasses import field import os -from lightrag.utils import setup_env import re -import logging from lightrag.core.component import Component, Sequential, fun_to_component -from lightrag.core.generator import 
Generator, GeneratorOutput +from lightrag.core.generator import Generator from lightrag.components.model_client import ( GroqAPIClient, - OpenAIClient, - GoogleGenAIClient, - AnthropicAPIClient, ) from lightrag.core.prompt_builder import Prompt from lightrag.components.output_parsers import YamlOutputParser -from lightrag.core.string_parser import JsonParser from lightrag.tracing import trace_generator_states, trace_generator_call @@ -28,7 +22,6 @@ from lightrag.core.base_data_class import DataClass -from use_cases.classification.data import _COARSE_LABELS_DESC, _COARSE_LABELS from use_cases.classification.utils import get_script_dir from use_cases.classification.config_log import log @@ -100,6 +93,27 @@ def get_tracing_path(): return os.path.join(get_script_dir(), "traces") +openai_model_kwargs = { + "model": "gpt-3.5-turbo", + "temperature": 0.0, + "top_p": 1, + "frequency_penalty": 0, + "presence_penalty": 0, + "n": 1, +} # noqa: F841 +google_model_kwargs = { + "model": "gemini-1.5-pro-latest", + "temperature": 0.0, + "top_p": 1, +} # noqa: F841 +anthropic_model_kwargs = { + "model": "claude-3-opus-20240229", + "temperature": 0.0, + "top_p": 1, + "max_tokens": 1024, +} # noqa: F841 + + @trace_generator_states(save_dir=get_tracing_path()) @trace_generator_call(save_dir=get_tracing_path(), error_only=True) class TRECClassifier(Component): @@ -154,25 +168,6 @@ def __init__( "presence_penalty": 0, "n": 1, } - openai_model_kwargs = { - "model": "gpt-3.5-turbo", - "temperature": 0.0, - "top_p": 1, - "frequency_penalty": 0, - "presence_penalty": 0, - "n": 1, - } - google_model_kwargs = { - "model": "gemini-1.5-pro-latest", - "temperature": 0.0, - "top_p": 1, - } - anthropic_model_kwargs = { - "model": "claude-3-opus-20240229", - "temperature": 0.0, - "top_p": 1, - "max_tokens": 1024, - } @fun_to_component def format_class_label(x: Dict[str, Any]) -> int: diff --git a/use_cases/llm_as_retriever.py b/use_cases/llm_as_retriever.py index 45145088..81a44229 100644 --- a/use_cases/llm_as_retriever.py +++ b/use_cases/llm_as_retriever.py @@ -3,8 +3,6 @@ from lightrag.components.retriever import LLMRetriever from lightrag.components.model_client import OpenAIClient -import utils.setup_env - def test_llm_retriever(): # TODO: directly pass Generator class is more intuitive than the generator_kwargs diff --git a/use_cases/rag_hotpotqa.py b/use_cases/rag_hotpotqa.py index 1ef6a036..3f651a5d 100644 --- a/use_cases/rag_hotpotqa.py +++ b/use_cases/rag_hotpotqa.py @@ -8,15 +8,13 @@ from lightrag.core.types import Document from lightrag.core.string_parser import JsonParser -from lightrag.core.component import Sequential, Component +from lightrag.core.component import Component from lightrag.eval import ( RetrieverRecall, RetrieverRelevance, AnswerMatchAcc, LLMasJudge, - DEFAULT_LLM_EVALUATOR_PROMPT, ) -from lightrag.core.prompt_builder import Prompt from use_cases.rag import RAG diff --git a/use_cases/simple_qa_anthropic.py b/use_cases/simple_qa_anthropic.py index 2c961d14..63273354 100644 --- a/use_cases/simple_qa_anthropic.py +++ b/use_cases/simple_qa_anthropic.py @@ -7,8 +7,6 @@ from lightrag.components.model_client import AnthropicAPIClient -import lightrag.utils.setup_env - class SimpleQA(Component): def __init__(self): diff --git a/use_cases/simple_qa_google.py b/use_cases/simple_qa_google.py index bc87080c..c672f079 100644 --- a/use_cases/simple_qa_google.py +++ b/use_cases/simple_qa_google.py @@ -7,8 +7,6 @@ from lightrag.components.model_client import GoogleGenAIClient -import utils.setup_env - class 
SimpleQA(Component): def __init__(self): diff --git a/use_cases/simple_qa_groq.py b/use_cases/simple_qa_groq.py index 31fe3120..ce38fa81 100644 --- a/use_cases/simple_qa_groq.py +++ b/use_cases/simple_qa_groq.py @@ -3,8 +3,6 @@ from lightrag.components.model_client import GroqAPIClient -import utils.setup_env - class SimpleQA(Component): def __init__(self): diff --git a/use_cases/simple_qa_memory.py b/use_cases/simple_qa_memory.py index 3cd2ca75..ee9464c4 100644 --- a/use_cases/simple_qa_memory.py +++ b/use_cases/simple_qa_memory.py @@ -2,60 +2,56 @@ We just need to very basic generator that can be used to generate text from a prompt. """ -from lightrag.core.generator import Generator -from lightrag.core.component import Component -from lightrag.core.memory import Memory - -from lightrag.components.model_client import OpenAIClient - - -import utils.setup_env - - -class SimpleDialog(Component): - def __init__(self): - super().__init__() - model_kwargs = {"model": "gpt-3.5-turbo"} - task_desc_str = "You are a helpful assistant." - self.generator = Generator( - model_client=OpenAIClient(), - model_kwargs=model_kwargs, - preset_prompt_kwargs={"task_desc_str": task_desc_str}, - ) - self.chat_history = Memory() - self.generator.print_prompt() - - def chat(self) -> str: - print("Welcome to SimpleQA. You can ask any question. Type 'exit' to end.") - while True: - user_input = input("You: ") - # - if user_input.lower() == "exit": - print("Goodbye!") - break - chat_history_str = self.chat_history() - response = self.generator( - prompt_kwargs={ - "chat_history_str": chat_history_str, - "input": user_input, - }, - ) - # save the user input and response to the memory - self.chat_history.add_dialog_turn( - user_query=user_input, assistant_response=response - ) - """ - Most components mush have a __call__ method in order to be chained together with other component in the data pipeline. - From the memory management, it is difficult to just chain them together. - This is similar to the retrieval. This additional step is to manage the exteral db like - data injection. Retrieving can be chained such as we use self.chat_history() to get the chat history. - """ - print(f"Assistant: {response}") - - # a class to have a multiple turns and take user input - - -if __name__ == "__main__": - simple_qa = SimpleDialog() - print(simple_qa) - print(simple_qa.chat()) +# from lightrag.core.component import Component +# from lightrag.core.memory import Memory + +# from lightrag.components.model_client import OpenAIClient + + +# class SimpleDialog(Component): +# def __init__(self): +# super().__init__() +# model_kwargs = {"model": "gpt-3.5-turbo"} +# task_desc_str = "You are a helpful assistant." +# self.generator = Generator( +# model_client=OpenAIClient(), +# model_kwargs=model_kwargs, +# preset_prompt_kwargs={"task_desc_str": task_desc_str}, +# ) +# self.chat_history = Memory() +# self.generator.print_prompt() + +# def chat(self) -> str: +# print("Welcome to SimpleQA. You can ask any question. 
Type 'exit' to end.") +# while True: +# user_input = input("You: ") +# # +# if user_input.lower() == "exit": +# print("Goodbye!") +# break +# chat_history_str = self.chat_history() +# response = self.generator( +# prompt_kwargs={ +# "chat_history_str": chat_history_str, +# "input": user_input, +# }, +# ) +# # save the user input and response to the memory +# self.chat_history.add_dialog_turn( +# user_query=user_input, assistant_response=response +# ) +# """ +# Most components mush have a __call__ method in order to be chained together with other component in the data pipeline. +# From the memory management, it is difficult to just chain them together. +# This is similar to the retrieval. This additional step is to manage the exteral db like +# data injection. Retrieving can be chained such as we use self.chat_history() to get the chat history. +# """ +# print(f"Assistant: {response}") + +# # a class to have a multiple turns and take user input + + +# if __name__ == "__main__": +# simple_qa = SimpleDialog() +# print(simple_qa) +# print(simple_qa.chat()) diff --git a/use_cases/simple_qa_trainable.py b/use_cases/simple_qa_trainable.py index 7e4523e7..e44f805c 100644 --- a/use_cases/simple_qa_trainable.py +++ b/use_cases/simple_qa_trainable.py @@ -3,8 +3,6 @@ from lightrag.components.model_client import GroqAPIClient -import utils.setup_env - class SimpleQA(Component): def __init__(self): diff --git a/use_cases/simple_rag.py b/use_cases/simple_rag.py index c9bda189..e45eba17 100644 --- a/use_cases/simple_rag.py +++ b/use_cases/simple_rag.py @@ -17,7 +17,6 @@ ToEmbeddings, DocumentSplitter, ) -from lightrag.utils import setup_env # noqa # TODO: RAG can potentially be a component itsefl and be provided to the users diff --git a/use_cases/simple_rag_bm_25.py b/use_cases/simple_rag_bm_25.py index ef141e7f..ba692b1d 100644 --- a/use_cases/simple_rag_bm_25.py +++ b/use_cases/simple_rag_bm_25.py @@ -1,19 +1,17 @@ from typing import Any, List, Optional -from core.generator import Generator -from core.data_components import ( +from lightrag.core.generator import Generator +from lightrag.components.data_process.data_components import ( RetrieverOutputToContextStr, ) -from core.string_parser import JsonParser -from core.component import Component, Sequential -from core.db import LocalDB -from core.types import Document +from lightrag.core.string_parser import JsonParser +from lightrag.core.component import Component, Sequential +from lightrag.core.db import LocalDB +from lightrag.core.types import Document -from components.retriever import InMemoryBM25Retriever -from components.model_client import OpenAIClient - -import utils.setup_env # noqa +from lightrag.components.retriever import BM25Retriever +from lightrag.components.model_client import OpenAIClient # TODO: RAG can potentially be a component itsefl and be provided to the users @@ -31,7 +29,7 @@ def __init__(self): "stream": False, } - self.retriever = InMemoryBM25Retriever( + self.retriever = BM25Retriever( top_k=self.retriever_settings["top_k"], ) self.retriever_output_processors = RetrieverOutputToContextStr(deduplicate=True) diff --git a/use_cases/use_embedder.py b/use_cases/use_embedder.py index 1498ce6f..04600e72 100644 --- a/use_cases/use_embedder.py +++ b/use_cases/use_embedder.py @@ -5,8 +5,6 @@ from lightrag.core.component import Component from lightrag.components.model_client import OpenAIClient -import utils.setup_env - class SimpleEmbedder(Component): """ @@ -50,6 +48,7 @@ async def main(): start_time = time.time() results = await 
asyncio.gather(*tasks) + print(results) end_time = time.time() print(f"Total time for 10 async calls: {end_time - start_time} seconds") diff --git a/use_cases/yaml_output.py b/use_cases/yaml_output.py index 65a3b1e6..0a26367c 100644 --- a/use_cases/yaml_output.py +++ b/use_cases/yaml_output.py @@ -1,15 +1,15 @@ from lightrag.core.component import Component from lightrag.core.generator import Generator -from lightrag.components.model_client import GroqAPIClient, OpenAIClient -from lightrag.components.output_parsers import YamlOutputParser, ListOutputParser +from lightrag.components.model_client import GroqAPIClient +from lightrag.components.output_parsers import YamlOutputParser +from dataclasses import dataclass from lightrag.core.base_data_class import DataClass, field from lightrag.core.types import GeneratorOutput -from lightrag.utils import setup_env - +@dataclass class JokeOutput(DataClass): setup: str = field(metadata={"desc": "question to set up a joke"}, default="") punchline: str = field(metadata={"desc": "answer to resolve the joke"}, default="") diff --git a/visualize.py b/visualize.py new file mode 100644 index 00000000..971d6e65 --- /dev/null +++ b/visualize.py @@ -0,0 +1,218 @@ +import os +import ast +import pydot +import networkx as nx +from pyvis.network import Network + + +def get_class_name(node): + if isinstance(node, ast.Name): + return node.id + elif isinstance(node, ast.Attribute): + return node.attr + elif isinstance(node, ast.Subscript): + return get_class_name(node.value) + elif isinstance(node, ast.Call): + return get_class_name(node.func) + return None + + +def get_classes_from_file(file_path): + try: + with open(file_path, "r") as file: + tree = ast.parse(file.read(), filename=file_path) + + classes = [node for node in ast.walk(tree) if isinstance(node, ast.ClassDef)] + except Exception as e: + print(f"Error processing {file_path}: {e}") + classes = [] + print(f"Found {len(classes)} classes in {file_path}") + return classes + + +def get_class_hierarchy_nx(classes): + graph = nx.DiGraph() + excluded_bases = { + "ComponentWithSuperInit", + "ComponentMissSuperInit", + "Memory", + "ObjectTypes", + } + + for cls in classes: + for base in cls.bases: + try: + base_name = get_class_name(base) + if base_name not in excluded_bases and cls.name not in excluded_bases: + print(f"Adding edge from {base_name} to {cls.name}") + graph.add_edge(base_name, cls.name) + except AttributeError as e: + print(f"Error processing {cls.name}: {base}, {e}") + pass + + return graph + + +def visualize_class_hierarchy_2(graph, filename="class_hierarchy.html"): + # Create a Pyvis Network + # Create a Pyvis Network + filename = filename.replace(".png", ".html") + print(filename) + net = Network(notebook=False, width="100%", height="100%", directed=True) + + # Add nodes and edges to the Pyvis network from the NetworkX graph + for node1, node2 in graph.edges(): + net.add_node(node1) + net.add_node(node2) + net.add_edge(node1, node2) + print("Nodes and edges added to the Pyvis network") + print(net) + + # Show graph + net.show(filename, local=True) + print(f"Class hierarchy saved as {filename}") + + +def save_edges_to_file(graph, filename="class_hierarchy_edges.csv"): + with open(filename, "w") as file: + for node1, node2 in graph.edges(): + file.write(f"{node1},{node2}\n") + print(f"Edges saved to {filename}") + + +def get_class_hierarchy(classes): + graph = pydot.Dot(graph_type="digraph") + excluded_bases = { + "ComponentWithSuperInit", + "ComponentMissSuperInit", + "Memory", + "ObjectTypes", + } 
+ + for cls in classes: + for base in cls.bases: + try: + base_name = get_class_name(base) + if base_name not in excluded_bases and cls.name not in excluded_bases: + print(f"Adding edge from {base_name} to {cls.name}") + graph.add_edge(pydot.Edge(base_name, cls.name)) + except AttributeError as e: + print(f"Error processing {cls.name}: {base}, {e}") + # base_name = base.value + pass + + return graph + + +# def visualize_class_hierarchy_nx(graph, filename="class_hierarchy.png"): + +# filename = filename.replace(".png", ".html") + +# pos = nx.spring_layout(graph) # You can choose other layouts as needed + +# plt.figure(figsize=(10, 8)) +# nx.draw( +# graph, +# pos, +# with_labels=True, +# node_size=1500, +# node_color="skyblue", +# font_size=10, +# font_color="black", +# font_weight="bold", +# edge_color="#666666", +# linewidths=2, +# arrows=True, +# arrowstyle="->", +# arrowsize=10, +# ) + +# plt.title("LightRAG Class Hierarchy") +# plt.tight_layout() + +# # Convert to HTML using mpld3 +# html_str = mpld3.fig_to_html(plt.gcf()) + +# with open(filename, "w") as file: +# file.write(html_str) + +# print(f"Class hierarchy saved as {filename}") + + +def visualize_class_hierarchy(graph, filename="class_hierarchy.png"): + dpi = 800 + graph.set_graph_defaults(dpi=str(dpi)) + + graph.write_png(filename, prog="dot") + graph.write_dot(filename.replace(".png", ".dot")) + graph.write_svg(filename.replace(".png", ".svg")) + print(f"Class hierarchy saved as {filename}") + + +def process_directory(directory): + all_classes = [] + for root, _, files in os.walk(directory): + for file in files: + if file.endswith(".py"): + file_path = os.path.join(root, file) + classes = get_classes_from_file(file_path) + all_classes.extend(classes) + + return all_classes + + +# Directory containing your Python files +def light_rag_paths(): + Light_rag_directory = "/Users/liyin/Documents/test/LightRAG/lightrag" + + paths = ["core", "components", "utils", "eval", "optim", "tracing"] + for path in paths: + yield os.path.join(Light_rag_directory, path) + + global graph_name + graph_name = "lightrag_class_hierarchy.png" + + +def llama_index_paths(): + Llama_index_directory = "/Users/liyin/Documents/test/LightRAG/.venv/lib/python3.11/site-packages/llama_index" + paths = ["core"] + for path in paths: + yield os.path.join(Llama_index_directory, path) + yield Llama_index_directory + + global graph_name + graph_name = "llama_index_class_hierarchy.png" + + +def lang_chain_paths(): + Lang_chain_directory = "/Users/liyin/Documents/test/LightRAG/.venv/lib/python3.11/site-packages/langchain" + # paths = ["core"] + # for path in paths: + # yield os.path.join(Lang_chain_directory, path) + yield Lang_chain_directory + + global graph_name + graph_name = "lang_chain_class_hierarchy.png" + + +if __name__ == "__main__": + paths = light_rag_paths() + # paths = llama_index_paths() + # paths = lang_chain_paths() + + # Get all classes from the directory + all_classes = [] + for path in paths: + all_classes.extend(process_directory(path)) + + # Generate the class hierarchy graph + # graph = get_class_hierarchy(all_classes) + + # use nx + graph = get_class_hierarchy_nx(all_classes) + + # Visualize the class hierarchy + # visualize_class_hierarchy(graph, filename=graph_name) + + # visualize_class_hierarchy_2(graph, filename=graph_name) + save_edges_to_file(graph, filename="class_hierarchy_edges.csv")
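
Editorial note: as added above, `visualize.py`'s `__main__` block only exports the class-hierarchy edges via `save_edges_to_file(...)`; the pydot/pyvis rendering calls remain commented out. Below is a minimal sketch (not part of the patch) of how that exported `class_hierarchy_edges.csv` could be loaded back and rendered with the `networkx`/`pyvis` dependencies introduced in `pyproject.toml`; the input CSV path and output HTML name are assumptions for illustration.

```python
# Sketch: rebuild the class-hierarchy graph from the edge list exported by
# visualize.py and render it with pyvis, mirroring the calls used in the patch.
# "class_hierarchy_edges.csv" and "class_hierarchy.html" are assumed names.
import networkx as nx
from pyvis.network import Network

graph = nx.DiGraph()
with open("class_hierarchy_edges.csv") as f:
    for line in f:
        if not line.strip():
            continue
        base, cls = line.strip().split(",")  # each row is "base_class,subclass"
        graph.add_edge(base, cls)

# Same constructor arguments as test_pyvis.py / visualize.py in this patch.
net = Network(notebook=False, width="100%", height="100%", directed=True)
net.from_nx(graph)  # copy nodes and edges from the NetworkX graph
net.show("class_hierarchy.html", local=True)  # write and open the HTML view
```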