diff --git a/embeddings-and-vector-databases-with-chromadb/README.md b/embeddings-and-vector-databases-with-chromadb/README.md
new file mode 100644
index 0000000000..429fc348dc
--- /dev/null
+++ b/embeddings-and-vector-databases-with-chromadb/README.md
@@ -0,0 +1,11 @@
+# Embeddings and Vector Databases With ChromaDB
+
+Supporting code for the Real Python tutorial [Embeddings and Vector Databases With ChromaDB](https://realpython.com/embeddings-and-vector-databases-with-chromadb/).
+
+To run the code in this tutorial, you should have `numpy`, `spacy`, `sentence-transformers`, `chromadb`, `polars`, `more-itertools`, and `openai` installed in your environment.
+
+You can install the dependencies manually or by running:
+
+```
+(venv) $ python -m pip install -r requirements.txt
+```
diff --git a/embeddings-and-vector-databases-with-chromadb/car_data_etl.py b/embeddings-and-vector-databases-with-chromadb/car_data_etl.py
new file mode 100644
index 0000000000..1026bc9f2d
--- /dev/null
+++ b/embeddings-and-vector-databases-with-chromadb/car_data_etl.py
@@ -0,0 +1,62 @@
+import pathlib
+
+import polars as pl
+
+
+def prepare_car_reviews_data(
+    data_path: pathlib.Path, vehicle_years: list[int] = [2017]
+):
+    """Prepare the car reviews dataset for ChromaDB"""
+
+    # Define the schema to ensure proper data types are enforced
+    dtypes = {
+        "": pl.Int64,
+        "Review_Date": pl.Utf8,
+        "Author_Name": pl.Utf8,
+        "Vehicle_Title": pl.Utf8,
+        "Review_Title": pl.Utf8,
+        "Review": pl.Utf8,
+        "Rating": pl.Float64,
+    }
+
+    # Scan the car reviews dataset(s)
+    car_reviews = pl.scan_csv(data_path, dtypes=dtypes)
+
+    # Extract the vehicle year and model as new columns
+    # and filter on the selected years
+    car_review_db_data = (
+        car_reviews.with_columns(
+            [
+                (
+                    pl.col("Vehicle_Title")
+                    .str.split(by=" ")
+                    .list.get(0)
+                    .cast(pl.Int64)
+                ).alias("Vehicle_Year"),
+                (pl.col("Vehicle_Title").str.split(by=" ").list.get(1)).alias(
+                    "Vehicle_Model"
+                ),
+            ]
+        )
+        .filter(pl.col("Vehicle_Year").is_in(vehicle_years))
+        .select(
+            [
+                "Review_Title",
+                "Review",
+                "Rating",
+                "Vehicle_Year",
+                "Vehicle_Model",
+            ]
+        )
+        .sort(["Vehicle_Model", "Rating"])
+        .collect()
+    )
+
+    # Create ids, documents, and metadatas data in the format chromadb expects
+    ids = [f"review{i}" for i in range(car_review_db_data.shape[0])]
+    documents = car_review_db_data["Review"].to_list()
+    metadatas = car_review_db_data.drop("Review").to_dicts()
+
+    chroma_data = {"ids": ids, "documents": documents, "metadatas": metadatas}
+
+    return chroma_data
diff --git a/embeddings-and-vector-databases-with-chromadb/chroma_utils.py b/embeddings-and-vector-databases-with-chromadb/chroma_utils.py
new file mode 100644
index 0000000000..b45f12c934
--- /dev/null
+++ b/embeddings-and-vector-databases-with-chromadb/chroma_utils.py
@@ -0,0 +1,41 @@
+import pathlib
+
+import chromadb
+from chromadb.utils import embedding_functions
+from more_itertools import batched
+
+
+def build_chroma_collection(
+    chroma_path: pathlib.Path,
+    collection_name: str,
+    embedding_func_name: str,
+    ids: list[str],
+    documents: list[str],
+    metadatas: list[dict],
+    distance_func_name: str = "cosine",
+):
+    """Create a ChromaDB collection"""
+
+    chroma_client = chromadb.PersistentClient(chroma_path)
+
+    embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
+        model_name=embedding_func_name
+    )
+
+    collection = chroma_client.create_collection(
+        name=collection_name,
+        embedding_function=embedding_func,
+        metadata={"hnsw:space": distance_func_name},
+    )
+
+    document_indices = list(range(len(documents)))
+
+    for batch in batched(document_indices, 166):
+        start_idx = batch[0]
+        end_idx = batch[-1]
+
+        collection.add(
+            ids=ids[start_idx : end_idx + 1],
+            documents=documents[start_idx : end_idx + 1],
+            metadatas=metadatas[start_idx : end_idx + 1],
+        )
diff --git a/embeddings-and-vector-databases-with-chromadb/config.json b/embeddings-and-vector-databases-with-chromadb/config.json
new file mode 100644
index 0000000000..a395e21fcf
--- /dev/null
+++ b/embeddings-and-vector-databases-with-chromadb/config.json
@@ -0,0 +1,3 @@
+{
+  "openai-secret-key": "your-api-key"
+}
\ No newline at end of file
diff --git a/embeddings-and-vector-databases-with-chromadb/cosine_similarity.py b/embeddings-and-vector-databases-with-chromadb/cosine_similarity.py
new file mode 100644
index 0000000000..cb3a416edd
--- /dev/null
+++ b/embeddings-and-vector-databases-with-chromadb/cosine_similarity.py
@@ -0,0 +1,7 @@
+import numpy as np
+
+
+def compute_cosine_similarity(u: np.ndarray, v: np.ndarray) -> float:
+    """Compute the cosine similarity between two vectors"""
+
+    return u.dot(v) / (np.linalg.norm(u) * np.linalg.norm(v))
diff --git a/embeddings-and-vector-databases-with-chromadb/create_car_review_collection.py b/embeddings-and-vector-databases-with-chromadb/create_car_review_collection.py
new file mode 100644
index 0000000000..929c9a2522
--- /dev/null
+++ b/embeddings-and-vector-databases-with-chromadb/create_car_review_collection.py
@@ -0,0 +1,39 @@
+import chromadb
+from chromadb.utils import embedding_functions
+
+from car_data_etl import prepare_car_reviews_data
+from chroma_utils import build_chroma_collection
+
+DATA_PATH = "data/archive/*"
+CHROMA_PATH = "car_review_embeddings"
+EMBEDDING_FUNC_NAME = "multi-qa-MiniLM-L6-cos-v1"
+COLLECTION_NAME = "car_reviews"
+
+chroma_car_reviews_dict = prepare_car_reviews_data(DATA_PATH)
+
+build_chroma_collection(
+    CHROMA_PATH,
+    COLLECTION_NAME,
+    EMBEDDING_FUNC_NAME,
+    chroma_car_reviews_dict["ids"],
+    chroma_car_reviews_dict["documents"],
+    chroma_car_reviews_dict["metadatas"],
+)
+
+client = chromadb.PersistentClient(CHROMA_PATH)
+embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
+    model_name=EMBEDDING_FUNC_NAME
+)
+collection = client.get_collection(
+    name=COLLECTION_NAME, embedding_function=embedding_func
+)
+
+great_reviews = collection.query(
+    query_texts=[
+        "Find me some positive reviews that discuss the car's performance"
+    ],
+    n_results=5,
+    include=["documents", "distances", "metadatas"],
+)
+
+print(great_reviews["documents"][0][0])
diff --git a/embeddings-and-vector-databases-with-chromadb/intro_to_vectors.py b/embeddings-and-vector-databases-with-chromadb/intro_to_vectors.py
new file mode 100644
index 0000000000..2659a23aeb
--- /dev/null
+++ b/embeddings-and-vector-databases-with-chromadb/intro_to_vectors.py
@@ -0,0 +1,23 @@
+import numpy as np
+
+# Create vectors with NumPy
+vector1 = np.array([1, 0])
+vector2 = np.array([0, 1])
+print(vector1)
+print(vector2)
+
+v1 = np.array([1, 0])
+v2 = np.array([0, 1])
+v3 = np.array([np.sqrt(2), np.sqrt(2)])
+
+# Dimension
+print(v1.shape)
+
+# Magnitude
+print(np.sqrt(np.sum(v1**2)))
+print(np.linalg.norm(v1))
+print(np.linalg.norm(v3))
+
+# Dot product
+print(np.sum(v1 * v2))
+print(v1.dot(v3))
diff --git a/embeddings-and-vector-databases-with-chromadb/llm_car_review_context.py b/embeddings-and-vector-databases-with-chromadb/llm_car_review_context.py
new file mode 100644
index 0000000000..cc9bff4112
--- /dev/null
+++ b/embeddings-and-vector-databases-with-chromadb/llm_car_review_context.py
@@ -0,0 +1,102 @@
+import json
+import os
+
+import chromadb
+import openai
+from chromadb.utils import embedding_functions
+
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+DATA_PATH = "data/archive/*"
+CHROMA_PATH = "car_review_embeddings"
+EMBEDDING_FUNC_NAME = "multi-qa-MiniLM-L6-cos-v1"
+COLLECTION_NAME = "car_reviews"
+
+with open("config.json", "r") as json_file:
+    config_data = json.load(json_file)
+
+openai.api_key = config_data.get("openai-secret-key")
+
+client = chromadb.PersistentClient(CHROMA_PATH)
+embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
+    model_name=EMBEDDING_FUNC_NAME
+)
+
+collection = client.get_collection(
+    name=COLLECTION_NAME, embedding_function=embedding_func
+)
+
+context = """
+    You are a customer success employee at a large
+    car dealership. Use the following car reviews
+    to answer questions: {}
+    """
+
+question = """
+    What's the key to great customer satisfaction
+    based on detailed positive reviews?
+    """
+
+good_reviews = collection.query(
+    query_texts=[question],
+    n_results=10,
+    include=["documents"],
+    where={"Rating": {"$gte": 3}},
+)
+
+reviews_str = ",".join(good_reviews["documents"][0])
+
+print("Good reviews: ")
+print(reviews_str)
+print("###########################################")
+
+good_review_summaries = openai.ChatCompletion.create(
+    model="gpt-3.5-turbo",
+    messages=[
+        {"role": "system", "content": context.format(reviews_str)},
+        {"role": "user", "content": question},
+    ],
+    temperature=0,
+    n=1,
+)
+
+print("AI-Generated summary of good reviews: ")
+print(good_review_summaries["choices"][0]["message"]["content"])
+print("###########################################")
+
+
+context = """
+    You are a customer success employee at a large car dealership.
+    Use the following car reviews to answer questions: {}
+    """
+question = """
+    Which of these poor reviews has the worst implications about
+    our dealership? Explain why.
+ """ + +poor_reviews = collection.query( + query_texts=[question], + n_results=5, + include=["documents"], + where={"Rating": {"$lte": 3}}, +) + +reviews_str = ",".join(poor_reviews["documents"][0]) + +print("Worst reviews: ") +print(poor_reviews["documents"][0][0]) +print("###########################################") + +poor_review_analysis = openai.ChatCompletion.create( + model="gpt-3.5-turbo", + messages=[ + {"role": "system", "content": context.format(reviews_str)}, + {"role": "user", "content": question}, + ], + temperature=0, + n=1, +) + +print("AI-Generated summary of the single worst review: ") +print(poor_review_analysis["choices"][0]["message"]["content"]) +print("###########################################") diff --git a/embeddings-and-vector-databases-with-chromadb/requirements.txt b/embeddings-and-vector-databases-with-chromadb/requirements.txt new file mode 100644 index 0000000000..48d2e1ed1c --- /dev/null +++ b/embeddings-and-vector-databases-with-chromadb/requirements.txt @@ -0,0 +1,92 @@ +aiohttp==3.8.6 +aiosignal==1.3.1 +annotated-types==0.6.0 +anyio==3.7.1 +async-timeout==4.0.3 +attrs==23.1.0 +backoff==2.2.1 +bcrypt==4.0.1 +blis==0.7.11 +catalogue==2.0.10 +certifi==2023.7.22 +charset-normalizer==3.3.0 +chroma-hnswlib==0.7.3 +chromadb==0.4.14 +click==8.1.7 +cloudpathlib==0.16.0 +coloredlogs==15.0.1 +confection==0.1.3 +cymem==2.0.8 +fastapi==0.104.0 +filelock==3.12.4 +flatbuffers==23.5.26 +frozenlist==1.4.0 +fsspec==2023.9.2 +grpcio==1.59.0 +h11==0.14.0 +httptools==0.6.1 +huggingface-hub==0.17.3 +humanfriendly==10.0 +idna==3.4 +importlib-resources==6.1.0 +Jinja2==3.1.2 +joblib==1.3.2 +langcodes==3.3.0 +MarkupSafe==2.1.3 +monotonic==1.6 +more-itertools==10.1.0 +mpmath==1.3.0 +multidict==6.0.4 +murmurhash==1.0.10 +networkx==3.2 +nltk==3.8.1 +numpy==1.26.1 +onnxruntime==1.16.1 +openai==0.28.1 +overrides==7.4.0 +packaging==23.2 +Pillow==10.1.0 +polars==0.19.9 +posthog==3.0.2 +preshed==3.0.9 +protobuf==4.24.4 +pulsar-client==3.3.0 +pydantic==2.4.2 +pydantic_core==2.10.1 +PyPika==0.48.9 +python-dateutil==2.8.2 +python-dotenv==1.0.0 +PyYAML==6.0.1 +regex==2023.10.3 +requests==2.31.0 +safetensors==0.4.0 +scikit-learn==1.3.1 +scipy==1.11.3 +sentence-transformers==2.2.2 +sentencepiece==0.1.99 +six==1.16.0 +smart-open==6.4.0 +sniffio==1.3.0 +spacy==3.7.2 +spacy-legacy==3.0.12 +spacy-loggers==1.0.5 +srsly==2.4.8 +starlette==0.27.0 +sympy==1.12 +thinc==8.2.1 +threadpoolctl==3.2.0 +tokenizers==0.14.1 +torch==2.1.0 +torchvision==0.16.0 +tqdm==4.66.1 +transformers==4.34.1 +typer==0.9.0 +typing_extensions==4.8.0 +urllib3==2.0.7 +uvicorn==0.23.2 +uvloop==0.18.0 +wasabi==1.1.2 +watchfiles==0.21.0 +weasel==0.3.3 +websockets==11.0.3 +yarl==1.9.2 diff --git a/embeddings-and-vector-databases-with-chromadb/text_embeddings.py b/embeddings-and-vector-databases-with-chromadb/text_embeddings.py new file mode 100644 index 0000000000..adce790e63 --- /dev/null +++ b/embeddings-and-vector-databases-with-chromadb/text_embeddings.py @@ -0,0 +1,41 @@ +from sentence_transformers import SentenceTransformer + +from cosine_similarity import compute_cosine_similarity + +model = SentenceTransformer("all-MiniLM-L6-v2") +texts = [ + "The canine barked loudly.", + "The dog made a noisy bark.", + "He ate a lot of pizza.", + "He devoured a large quantity of pizza pie.", +] + +text_embeddings = model.encode(texts) + +print(type(text_embeddings)) + +print(text_embeddings.shape) + +text_embeddings_dict = dict(zip(texts, list(text_embeddings))) + +dog_text_1 = "The canine barked loudly." 
+dog_text_2 = "The dog made a noisy bark." +print( + compute_cosine_similarity( + text_embeddings_dict[dog_text_1], text_embeddings_dict[dog_text_2] + ) +) + +pizza_text_1 = "He ate a lot of pizza." +pizza_text_2 = "He devoured a large quantity of pizza pie." +print( + compute_cosine_similarity( + text_embeddings_dict[pizza_text_1], text_embeddings_dict[pizza_text_2] + ) +) + +print( + compute_cosine_similarity( + text_embeddings_dict[dog_text_1], text_embeddings_dict[pizza_text_1] + ) +) diff --git a/embeddings-and-vector-databases-with-chromadb/word_vectors.py b/embeddings-and-vector-databases-with-chromadb/word_vectors.py new file mode 100644 index 0000000000..ccaed6e0a5 --- /dev/null +++ b/embeddings-and-vector-databases-with-chromadb/word_vectors.py @@ -0,0 +1,35 @@ +import spacy + +from cosine_similarity import compute_cosine_similarity + +# Load the medium-size English model +nlp = spacy.load("en_core_web_md") + +# Get the word vector for the word "dog" +dog_embedding = nlp.vocab["dog"].vector + +# Word vectors are stored as NumPy arrays +print(type(dog_embedding)) + +# Word vector dimension +print(dog_embedding.shape) + +# First 10 elements of the "dog" word vector +print(dog_embedding[0:10]) + +dog_embedding = nlp.vocab["dog"].vector +cat_embedding = nlp.vocab["cat"].vector +apple_embedding = nlp.vocab["apple"].vector +tasty_embedding = nlp.vocab["tasty"].vector +delicious_embedding = nlp.vocab["delicious"].vector +truck_embedding = nlp.vocab["truck"].vector + +print(compute_cosine_similarity(dog_embedding, cat_embedding)) + +print(compute_cosine_similarity(delicious_embedding, tasty_embedding)) + +print(compute_cosine_similarity(apple_embedding, delicious_embedding)) + +print(compute_cosine_similarity(dog_embedding, apple_embedding)) + +print(compute_cosine_similarity(truck_embedding, delicious_embedding))