From ed8d2b0c69ec17700a307376d2c52df292cabbea Mon Sep 17 00:00:00 2001 From: hfhoffman1144 Date: Thu, 19 Oct 2023 08:21:46 -0500 Subject: [PATCH 1/8] first materials commit for chromadb article --- embeddings_chromadb/README.md | 7 ++ embeddings_chromadb/car_data_etl.py | 61 ++++++++++ embeddings_chromadb/chroma_utils.py | 40 +++++++ embeddings_chromadb/config.json | 3 + embeddings_chromadb/cosine_similarity.py | 7 ++ .../create_car_review_collection.py | 38 ++++++ embeddings_chromadb/intro_to_vectors.py | 23 ++++ embeddings_chromadb/llm_car_review_context.py | 113 ++++++++++++++++++ embeddings_chromadb/text_embeddings.py | 40 +++++++ embeddings_chromadb/word_vectors.py | 34 ++++++ 10 files changed, 366 insertions(+) create mode 100644 embeddings_chromadb/README.md create mode 100644 embeddings_chromadb/car_data_etl.py create mode 100644 embeddings_chromadb/chroma_utils.py create mode 100644 embeddings_chromadb/config.json create mode 100644 embeddings_chromadb/cosine_similarity.py create mode 100644 embeddings_chromadb/create_car_review_collection.py create mode 100644 embeddings_chromadb/intro_to_vectors.py create mode 100644 embeddings_chromadb/llm_car_review_context.py create mode 100644 embeddings_chromadb/text_embeddings.py create mode 100644 embeddings_chromadb/word_vectors.py diff --git a/embeddings_chromadb/README.md b/embeddings_chromadb/README.md new file mode 100644 index 0000000000..6f116901b0 --- /dev/null +++ b/embeddings_chromadb/README.md @@ -0,0 +1,7 @@ +# Embeddings and Vector Databases With ChromaDB + +Supporting code for the Real Python tutorial [Embeddings and Vector Databases With ChromaDB](https://realpython.com/embeddings-and-vector-databases-with-chromadb/). + +To run the code in this tutorial, you should have `numpy`, `spacy`, `sentence-transformers`, `chromadb`, `polars`, `more-itertools`, and `openai` installed in your environment. 
+ + diff --git a/embeddings_chromadb/car_data_etl.py b/embeddings_chromadb/car_data_etl.py new file mode 100644 index 0000000000..b454a4e19b --- /dev/null +++ b/embeddings_chromadb/car_data_etl.py @@ -0,0 +1,61 @@ +import pathlib +import polars as pl + + +def prepare_car_reviews_data( + data_path: pathlib.Path, vehicle_years: list[int] = [2017] +): + """Prepare the car reviews dataset for ChromaDB""" + + # Define the schema to ensure proper data types are enforced + dtypes = { + "": pl.Int64, + "Review_Date": pl.Utf8, + "Author_Name": pl.Utf8, + "Vehicle_Title": pl.Utf8, + "Review_Title": pl.Utf8, + "Review": pl.Utf8, + "Rating": pl.Float64, + } + + # Scan the car reviews dataset(s) + car_reviews = pl.scan_csv(data_path, dtypes=dtypes) + + # Extract the vehicle title and year as new columns + # Filter on selected years + car_review_db_data = ( + car_reviews.with_columns( + [ + ( + pl.col("Vehicle_Title") + .str.split(by=" ") + .list.get(0) + .cast(pl.Int64) + ).alias("Vehicle_Year"), + (pl.col("Vehicle_Title").str.split(by=" ").list.get(1)).alias( + "Vehicle_Model" + ), + ] + ) + .filter(pl.col("Vehicle_Year").is_in(vehicle_years)) + .select( + [ + "Review_Title", + "Review", + "Rating", + "Vehicle_Year", + "Vehicle_Model", + ] + ) + .sort(["Vehicle_Model", "Rating"]) + .collect() + ) + + # Create ids, documents, and metadatas data in the format chromadb expects + ids = [f"review{i}" for i in range(car_review_db_data.shape[0])] + documents = car_review_db_data["Review"].to_list() + metadatas = car_review_db_data.drop("Review").to_dicts() + + chroma_data = {"ids": ids, "documents": documents, "metadatas": metadatas} + + return chroma_data diff --git a/embeddings_chromadb/chroma_utils.py b/embeddings_chromadb/chroma_utils.py new file mode 100644 index 0000000000..e1afd386c6 --- /dev/null +++ b/embeddings_chromadb/chroma_utils.py @@ -0,0 +1,40 @@ +import pathlib +import chromadb +from chromadb.utils import embedding_functions +from more_itertools import batched + + 
+def build_chroma_collection( + chroma_path: pathlib.Path, + collection_name: str, + embbeding_func_name: str, + ids: list[str], + documents: list[str], + metadatas: list[dict], + distance_func_name: str = "cosine", +): + """Create a ChromaDB collection""" + + chroma_client = chromadb.PersistentClient(chroma_path) + + embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction( + model_name=embbeding_func_name + ) + + collection = chroma_client.create_collection( + name=collection_name, + embedding_function=embedding_func, + metadata={"hnsw:space": distance_func_name}, + ) + + document_indicies = list(range(len(documents))) + + for batch in batched(document_indicies, 166): + start_idx = batch[0] + end_idx = batch[-1] + + collection.add( + ids=ids[start_idx:end_idx], + documents=documents[start_idx:end_idx], + metadatas=metadatas[start_idx:end_idx], + ) diff --git a/embeddings_chromadb/config.json b/embeddings_chromadb/config.json new file mode 100644 index 0000000000..a395e21fcf --- /dev/null +++ b/embeddings_chromadb/config.json @@ -0,0 +1,3 @@ +{ + "openai-secret-key": "your-api-key" +} \ No newline at end of file diff --git a/embeddings_chromadb/cosine_similarity.py b/embeddings_chromadb/cosine_similarity.py new file mode 100644 index 0000000000..cb3a416edd --- /dev/null +++ b/embeddings_chromadb/cosine_similarity.py @@ -0,0 +1,7 @@ +import numpy as np + + +def compute_cosine_similarity(u: np.ndarray, v: np.ndarray) -> float: + """Compute the cosine similarity between two vectors""" + + return u.dot(v) / (np.linalg.norm(u) * np.linalg.norm(v)) diff --git a/embeddings_chromadb/create_car_review_collection.py b/embeddings_chromadb/create_car_review_collection.py new file mode 100644 index 0000000000..1e7c4643fb --- /dev/null +++ b/embeddings_chromadb/create_car_review_collection.py @@ -0,0 +1,38 @@ +import chromadb +from chromadb.utils import embedding_functions +from car_data_etl import prepare_car_reviews_data +from chroma_utils import 
build_chroma_collection + +DATA_PATH = "data/archive/*" +CHROMA_PATH = "car_review_embeddings" +EMBEDDING_FUNC_NAME = "multi-qa-MiniLM-L6-cos-v1" +COLLECTION_NAME = "car_reviews" + +chroma_car_reviews_dict = prepare_car_reviews_data(DATA_PATH) + +build_chroma_collection( + CHROMA_PATH, + COLLECTION_NAME, + EMBEDDING_FUNC_NAME, + chroma_car_reviews_dict["ids"], + chroma_car_reviews_dict["documents"], + chroma_car_reviews_dict["metadatas"], +) + +client = chromadb.PersistentClient(CHROMA_PATH) +embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction( + model_name=EMBEDDING_FUNC_NAME +) +collection = client.get_collection( + name=COLLECTION_NAME, embedding_function=embedding_func +) + +great_reviews = collection.query( + query_texts=[ + "Find me some positive reviews that discuss the car's performance" + ], + n_results=5, + include=["documents", "distances", "metadatas"], +) + +print(great_reviews["documents"][0][0]) diff --git a/embeddings_chromadb/intro_to_vectors.py b/embeddings_chromadb/intro_to_vectors.py new file mode 100644 index 0000000000..2659a23aeb --- /dev/null +++ b/embeddings_chromadb/intro_to_vectors.py @@ -0,0 +1,23 @@ +import numpy as np + +# Create vectors with NumPy +vector1 = np.array([1, 0]) +vector2 = np.array([0, 1]) +print(vector1) +print(vector2) + +v1 = np.array([1, 0]) +v2 = np.array([0, 1]) +v3 = np.array([np.sqrt(2), np.sqrt(2)]) + +# Dimension +print(v1.shape) + +# Magnitude +print(np.sqrt(np.sum(v1**2))) +print(np.linalg.norm(v1)) +print(np.linalg.norm(v3)) + +# Dot product +print(np.sum(v1 * v2)) +print(v1.dot(v3)) diff --git a/embeddings_chromadb/llm_car_review_context.py b/embeddings_chromadb/llm_car_review_context.py new file mode 100644 index 0000000000..f648a9940e --- /dev/null +++ b/embeddings_chromadb/llm_car_review_context.py @@ -0,0 +1,113 @@ +import os +import json +import openai +import chromadb +from chromadb.utils import embedding_functions + +os.environ["TOKENIZERS_PARALLELISM"] = "false" + +DATA_PATH = 
"data/archive/*" +CHROMA_PATH = "car_review_embeddings" +EMBEDDING_FUNC_NAME = "multi-qa-MiniLM-L6-cos-v1" +COLLECTION_NAME = "car_reviews" + +with open("config.json", "r") as json_file: + config_data = json.load(json_file) + +openai.api_key = config_data.get("openai-secret-key") + +client = chromadb.PersistentClient(CHROMA_PATH) +embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction( + model_name=EMBEDDING_FUNC_NAME +) + +collection = client.get_collection( + name=COLLECTION_NAME, embedding_function=embedding_func +) + +context = """ + You are a customer success employee at a large + car dealership. Use the following car reviews + to answer questions: {} + """ + +question = """ + What's the key to great customer satisfaction + based on detailed positive reviews? + """ + +good_reviews = collection.query( + query_texts=[question], + n_results=10, + include=["documents"], + where={"Rating": {"$gte": 3}}, +) + +reviews_str = ",".join(good_reviews["documents"][0]) + +good_review_summaries = openai.ChatCompletion.create( + model="gpt-3.5-turbo", + messages=[ + {"role": "system", "content": context.format(reviews_str)}, + {"role": "user", "content": question}, + ], + temperature=0, + n=1, +) + +reviews_str = ",".join(good_reviews["documents"][0]) + +print("Good reviews: ") +print(reviews_str) +print("###########################################") + +good_review_summaries = openai.ChatCompletion.create( + model="gpt-3.5-turbo", + messages=[ + {"role": "system", "content": context.format(reviews_str)}, + {"role": "user", "content": question}, + ], + temperature=0, + n=1, +) + +print("AI-Generated summary of good reviews: ") +print(good_review_summaries["choices"][0]["message"]["content"]) +print("###########################################") + + +context = """ + You are a customer success employee at a large car dealership. 
+ Use the following car reivews to answer questions: {} + """ +question = """ + Which of these poor reviews has the worst implications about + our dealership? Explain why. + """ + +poor_reviews = collection.query( + query_texts=[question], + n_results=5, + include=["documents"], + where={"Rating": {"$lte": 3}}, +) + +reviews_str = ",".join(poor_reviews["documents"][0]) + +print("Worst reviews: ") +print(poor_reviews["documents"][0][0]) +print("###########################################") + +poor_review_analysis = openai.ChatCompletion.create( + model="gpt-3.5-turbo", + messages=[ + {"role": "system", "content": context.format(reviews_str)}, + {"role": "user", "content": question}, + ], + temperature=0, + n=1, +) + +print("AI-Generated summary of the single worst review: ") +print(poor_review_analysis["choices"][0]["message"]["content"]) +print("###########################################") diff --git a/embeddings_chromadb/text_embeddings.py b/embeddings_chromadb/text_embeddings.py new file mode 100644 index 0000000000..073416f357 --- /dev/null +++ b/embeddings_chromadb/text_embeddings.py @@ -0,0 +1,40 @@ +from sentence_transformers import SentenceTransformer +from cosine_similarity import compute_cosine_similarity + +model = SentenceTransformer("all-MiniLM-L6-v2") +texts = [ + "The canine barked loudly.", + "The dog made a noisy bark.", + "He ate a lot of pizza.", + "He devoured a large quantity of pizza pie.", +] + +text_embeddings = model.encode(texts) + +print(type(text_embeddings)) + +print(text_embeddings.shape) + +text_embeddings_dict = dict(zip(texts, list(text_embeddings))) + +dog_text_1 = "The canine barked loudly." +dog_text_2 = "The dog made a noisy bark." +print( + compute_cosine_similarity( + text_embeddings_dict[dog_text_1], text_embeddings_dict[dog_text_2] + ) +) + +pizza_text_1 = "He ate a lot of pizza." +pizza_text_2 = "He devoured a large quantity of pizza pie." 
+print( + compute_cosine_similarity( + text_embeddings_dict[pizza_text_1], text_embeddings_dict[pizza_text_2] + ) +) + +print( + compute_cosine_similarity( + text_embeddings_dict[dog_text_1], text_embeddings_dict[pizza_text_1] + ) +) diff --git a/embeddings_chromadb/word_vectors.py b/embeddings_chromadb/word_vectors.py new file mode 100644 index 0000000000..f3ddb9e2c9 --- /dev/null +++ b/embeddings_chromadb/word_vectors.py @@ -0,0 +1,34 @@ +import spacy +from cosine_similarity import compute_cosine_similarity + +# Load the medium-size English model +nlp = spacy.load("en_core_web_md") + +# Get the word vector for the word "dog" +dog_embedding = nlp.vocab["dog"].vector + +# Word vectors are stored as NumPy arrays +print(type(dog_embedding)) + +# Word vector dimension +print(dog_embedding.shape) + +# First 10 elements of the "dog" word vector +print(dog_embedding[0:10]) + +dog_embedding = nlp.vocab["dog"].vector +cat_embedding = nlp.vocab["cat"].vector +apple_embedding = nlp.vocab["apple"].vector +tasty_embedding = nlp.vocab["tasty"].vector +delicious_embedding = nlp.vocab["delicious"].vector +truck_embedding = nlp.vocab["truck"].vector + +print(compute_cosine_similarity(dog_embedding, cat_embedding)) + +print(compute_cosine_similarity(delicious_embedding, tasty_embedding)) + +print(compute_cosine_similarity(apple_embedding, delicious_embedding)) + +print(compute_cosine_similarity(dog_embedding, apple_embedding)) + +print(compute_cosine_similarity(truck_embedding, delicious_embedding)) From 76c848a09f64e77e9bd5dc8d273e333a4918be53 Mon Sep 17 00:00:00 2001 From: hfhoffman1144 Date: Thu, 19 Oct 2023 08:23:49 -0500 Subject: [PATCH 2/8] renamed folder to slug --- .../README.md | 0 .../car_data_etl.py | 0 .../chroma_utils.py | 0 .../config.json | 0 .../cosine_similarity.py | 0 .../create_car_review_collection.py | 0 .../intro_to_vectors.py | 0 .../llm_car_review_context.py | 0 .../text_embeddings.py | 0 .../word_vectors.py | 0 10 files changed, 0 insertions(+), 0 
deletions(-) rename {embeddings_chromadb => embeddings-and-vector-databases-with-chromadb}/README.md (100%) rename {embeddings_chromadb => embeddings-and-vector-databases-with-chromadb}/car_data_etl.py (100%) rename {embeddings_chromadb => embeddings-and-vector-databases-with-chromadb}/chroma_utils.py (100%) rename {embeddings_chromadb => embeddings-and-vector-databases-with-chromadb}/config.json (100%) rename {embeddings_chromadb => embeddings-and-vector-databases-with-chromadb}/cosine_similarity.py (100%) rename {embeddings_chromadb => embeddings-and-vector-databases-with-chromadb}/create_car_review_collection.py (100%) rename {embeddings_chromadb => embeddings-and-vector-databases-with-chromadb}/intro_to_vectors.py (100%) rename {embeddings_chromadb => embeddings-and-vector-databases-with-chromadb}/llm_car_review_context.py (100%) rename {embeddings_chromadb => embeddings-and-vector-databases-with-chromadb}/text_embeddings.py (100%) rename {embeddings_chromadb => embeddings-and-vector-databases-with-chromadb}/word_vectors.py (100%) diff --git a/embeddings_chromadb/README.md b/embeddings-and-vector-databases-with-chromadb/README.md similarity index 100% rename from embeddings_chromadb/README.md rename to embeddings-and-vector-databases-with-chromadb/README.md diff --git a/embeddings_chromadb/car_data_etl.py b/embeddings-and-vector-databases-with-chromadb/car_data_etl.py similarity index 100% rename from embeddings_chromadb/car_data_etl.py rename to embeddings-and-vector-databases-with-chromadb/car_data_etl.py diff --git a/embeddings_chromadb/chroma_utils.py b/embeddings-and-vector-databases-with-chromadb/chroma_utils.py similarity index 100% rename from embeddings_chromadb/chroma_utils.py rename to embeddings-and-vector-databases-with-chromadb/chroma_utils.py diff --git a/embeddings_chromadb/config.json b/embeddings-and-vector-databases-with-chromadb/config.json similarity index 100% rename from embeddings_chromadb/config.json rename to 
embeddings-and-vector-databases-with-chromadb/config.json diff --git a/embeddings_chromadb/cosine_similarity.py b/embeddings-and-vector-databases-with-chromadb/cosine_similarity.py similarity index 100% rename from embeddings_chromadb/cosine_similarity.py rename to embeddings-and-vector-databases-with-chromadb/cosine_similarity.py diff --git a/embeddings_chromadb/create_car_review_collection.py b/embeddings-and-vector-databases-with-chromadb/create_car_review_collection.py similarity index 100% rename from embeddings_chromadb/create_car_review_collection.py rename to embeddings-and-vector-databases-with-chromadb/create_car_review_collection.py diff --git a/embeddings_chromadb/intro_to_vectors.py b/embeddings-and-vector-databases-with-chromadb/intro_to_vectors.py similarity index 100% rename from embeddings_chromadb/intro_to_vectors.py rename to embeddings-and-vector-databases-with-chromadb/intro_to_vectors.py diff --git a/embeddings_chromadb/llm_car_review_context.py b/embeddings-and-vector-databases-with-chromadb/llm_car_review_context.py similarity index 100% rename from embeddings_chromadb/llm_car_review_context.py rename to embeddings-and-vector-databases-with-chromadb/llm_car_review_context.py diff --git a/embeddings_chromadb/text_embeddings.py b/embeddings-and-vector-databases-with-chromadb/text_embeddings.py similarity index 100% rename from embeddings_chromadb/text_embeddings.py rename to embeddings-and-vector-databases-with-chromadb/text_embeddings.py diff --git a/embeddings_chromadb/word_vectors.py b/embeddings-and-vector-databases-with-chromadb/word_vectors.py similarity index 100% rename from embeddings_chromadb/word_vectors.py rename to embeddings-and-vector-databases-with-chromadb/word_vectors.py From 5d419cddcb097743632ee9e53b4ba254ad3d8eb3 Mon Sep 17 00:00:00 2001 From: martin-martin Date: Fri, 20 Oct 2023 11:41:33 +0200 Subject: [PATCH 3/8] Add requirements file --- .../README.md | 4 + .../requirements.txt | 92 +++++++++++++++++++ 2 files changed, 
96 insertions(+) create mode 100644 embeddings-and-vector-databases-with-chromadb/requirements.txt diff --git a/embeddings-and-vector-databases-with-chromadb/README.md b/embeddings-and-vector-databases-with-chromadb/README.md index 6f116901b0..429fc348dc 100644 --- a/embeddings-and-vector-databases-with-chromadb/README.md +++ b/embeddings-and-vector-databases-with-chromadb/README.md @@ -4,4 +4,8 @@ Supporting code for the Real Python tutorial [Embeddings and Vector Databases Wi To run the code in this tutorial, you should have `numpy`, `spacy`, `sentence-transformers`, `chromadb`, `polars`, `more-itertools`, and `openai` installed in your environment. +You can install the dependencies manually, or by running: +``` +(venv) $ python -m pip install -r requirements.txt +``` diff --git a/embeddings-and-vector-databases-with-chromadb/requirements.txt b/embeddings-and-vector-databases-with-chromadb/requirements.txt new file mode 100644 index 0000000000..48d2e1ed1c --- /dev/null +++ b/embeddings-and-vector-databases-with-chromadb/requirements.txt @@ -0,0 +1,92 @@ +aiohttp==3.8.6 +aiosignal==1.3.1 +annotated-types==0.6.0 +anyio==3.7.1 +async-timeout==4.0.3 +attrs==23.1.0 +backoff==2.2.1 +bcrypt==4.0.1 +blis==0.7.11 +catalogue==2.0.10 +certifi==2023.7.22 +charset-normalizer==3.3.0 +chroma-hnswlib==0.7.3 +chromadb==0.4.14 +click==8.1.7 +cloudpathlib==0.16.0 +coloredlogs==15.0.1 +confection==0.1.3 +cymem==2.0.8 +fastapi==0.104.0 +filelock==3.12.4 +flatbuffers==23.5.26 +frozenlist==1.4.0 +fsspec==2023.9.2 +grpcio==1.59.0 +h11==0.14.0 +httptools==0.6.1 +huggingface-hub==0.17.3 +humanfriendly==10.0 +idna==3.4 +importlib-resources==6.1.0 +Jinja2==3.1.2 +joblib==1.3.2 +langcodes==3.3.0 +MarkupSafe==2.1.3 +monotonic==1.6 +more-itertools==10.1.0 +mpmath==1.3.0 +multidict==6.0.4 +murmurhash==1.0.10 +networkx==3.2 +nltk==3.8.1 +numpy==1.26.1 +onnxruntime==1.16.1 +openai==0.28.1 +overrides==7.4.0 +packaging==23.2 +Pillow==10.1.0 +polars==0.19.9 +posthog==3.0.2 +preshed==3.0.9 
+protobuf==4.24.4 +pulsar-client==3.3.0 +pydantic==2.4.2 +pydantic_core==2.10.1 +PyPika==0.48.9 +python-dateutil==2.8.2 +python-dotenv==1.0.0 +PyYAML==6.0.1 +regex==2023.10.3 +requests==2.31.0 +safetensors==0.4.0 +scikit-learn==1.3.1 +scipy==1.11.3 +sentence-transformers==2.2.2 +sentencepiece==0.1.99 +six==1.16.0 +smart-open==6.4.0 +sniffio==1.3.0 +spacy==3.7.2 +spacy-legacy==3.0.12 +spacy-loggers==1.0.5 +srsly==2.4.8 +starlette==0.27.0 +sympy==1.12 +thinc==8.2.1 +threadpoolctl==3.2.0 +tokenizers==0.14.1 +torch==2.1.0 +torchvision==0.16.0 +tqdm==4.66.1 +transformers==4.34.1 +typer==0.9.0 +typing_extensions==4.8.0 +urllib3==2.0.7 +uvicorn==0.23.2 +uvloop==0.18.0 +wasabi==1.1.2 +watchfiles==0.21.0 +weasel==0.3.3 +websockets==11.0.3 +yarl==1.9.2 From a33d8d9c6d4f863c72db7b15a2293ae00bb79439 Mon Sep 17 00:00:00 2001 From: martin-martin Date: Fri, 20 Oct 2023 11:42:13 +0200 Subject: [PATCH 4/8] Apply isort formatting --- .../car_data_etl.py | 1 + .../chroma_utils.py | 1 + .../create_car_review_collection.py | 1 + .../llm_car_review_context.py | 5 +++-- .../text_embeddings.py | 1 + .../word_vectors.py | 1 + 6 files changed, 8 insertions(+), 2 deletions(-) diff --git a/embeddings-and-vector-databases-with-chromadb/car_data_etl.py b/embeddings-and-vector-databases-with-chromadb/car_data_etl.py index b454a4e19b..1026bc9f2d 100644 --- a/embeddings-and-vector-databases-with-chromadb/car_data_etl.py +++ b/embeddings-and-vector-databases-with-chromadb/car_data_etl.py @@ -1,4 +1,5 @@ import pathlib + import polars as pl diff --git a/embeddings-and-vector-databases-with-chromadb/chroma_utils.py b/embeddings-and-vector-databases-with-chromadb/chroma_utils.py index e1afd386c6..e565177743 100644 --- a/embeddings-and-vector-databases-with-chromadb/chroma_utils.py +++ b/embeddings-and-vector-databases-with-chromadb/chroma_utils.py @@ -1,4 +1,5 @@ import pathlib + import chromadb from chromadb.utils import embedding_functions from more_itertools import batched diff --git 
a/embeddings-and-vector-databases-with-chromadb/create_car_review_collection.py b/embeddings-and-vector-databases-with-chromadb/create_car_review_collection.py index 1e7c4643fb..929c9a2522 100644 --- a/embeddings-and-vector-databases-with-chromadb/create_car_review_collection.py +++ b/embeddings-and-vector-databases-with-chromadb/create_car_review_collection.py @@ -1,5 +1,6 @@ import chromadb from chromadb.utils import embedding_functions + from car_data_etl import prepare_car_reviews_data from chroma_utils import build_chroma_collection diff --git a/embeddings-and-vector-databases-with-chromadb/llm_car_review_context.py b/embeddings-and-vector-databases-with-chromadb/llm_car_review_context.py index f648a9940e..cc9bff4112 100644 --- a/embeddings-and-vector-databases-with-chromadb/llm_car_review_context.py +++ b/embeddings-and-vector-databases-with-chromadb/llm_car_review_context.py @@ -1,7 +1,8 @@ -import os import json -import openai +import os + import chromadb +import openai from chromadb.utils import embedding_functions os.environ["TOKENIZERS_PARALLELISM"] = "false" diff --git a/embeddings-and-vector-databases-with-chromadb/text_embeddings.py b/embeddings-and-vector-databases-with-chromadb/text_embeddings.py index 073416f357..adce790e63 100644 --- a/embeddings-and-vector-databases-with-chromadb/text_embeddings.py +++ b/embeddings-and-vector-databases-with-chromadb/text_embeddings.py @@ -1,4 +1,5 @@ from sentence_transformers import SentenceTransformer + from cosine_similarity import compute_cosine_similarity model = SentenceTransformer("all-MiniLM-L6-v2") diff --git a/embeddings-and-vector-databases-with-chromadb/word_vectors.py b/embeddings-and-vector-databases-with-chromadb/word_vectors.py index f3ddb9e2c9..ccaed6e0a5 100644 --- a/embeddings-and-vector-databases-with-chromadb/word_vectors.py +++ b/embeddings-and-vector-databases-with-chromadb/word_vectors.py @@ -1,4 +1,5 @@ import spacy + from cosine_similarity import compute_cosine_similarity # Load the 
medium-size English model From 32e1ce08b38798544f896bd4405843fae8c0f553 Mon Sep 17 00:00:00 2001 From: eyrei123 <88923476+eyrei123@users.noreply.github.com> Date: Tue, 31 Oct 2023 20:33:48 +0000 Subject: [PATCH 5/8] Update Handling Exceptions.md (#455) Three changed to Two --- jupyter-lab-files/Handling Exceptions.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jupyter-lab-files/Handling Exceptions.md b/jupyter-lab-files/Handling Exceptions.md index 61b033e904..528c72fc85 100644 --- a/jupyter-lab-files/Handling Exceptions.md +++ b/jupyter-lab-files/Handling Exceptions.md @@ -24,7 +24,7 @@ This is how you stop the exception from crashing the code. ## What Are Some Common Exceptions The Python language supports more than sixty common exceptions. -Three of the more common are: +Two of the more common are: | Exception | Cause | @@ -43,4 +43,4 @@ except ZeroDivisionError: print("You can't divide by zero") except ValueError: print("You must supply a number") -``` \ No newline at end of file +``` From 393b2c0517085fbbc6279b7e8bd764aada20731c Mon Sep 17 00:00:00 2001 From: KateFinegan <95366190+KateFinegan@users.noreply.github.com> Date: Tue, 31 Oct 2023 19:15:42 -0600 Subject: [PATCH 6/8] Language edit --- embeddings-and-vector-databases-with-chromadb/chroma_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/embeddings-and-vector-databases-with-chromadb/chroma_utils.py b/embeddings-and-vector-databases-with-chromadb/chroma_utils.py index e565177743..b45f12c934 100644 --- a/embeddings-and-vector-databases-with-chromadb/chroma_utils.py +++ b/embeddings-and-vector-databases-with-chromadb/chroma_utils.py @@ -28,9 +28,9 @@ def build_chroma_collection( metadata={"hnsw:space": distance_func_name}, ) - document_indicies = list(range(len(documents))) + document_indices = list(range(len(documents))) - for batch in batched(document_indicies, 166): + for batch in batched(document_indices, 166): start_idx = batch[0] end_idx = 
batch[-1] From 51024491a7aab48c0cda4ea1f01e70b18ea3b3e8 Mon Sep 17 00:00:00 2001 From: eyrei123 <88923476+eyrei123@users.noreply.github.com> Date: Wed, 1 Nov 2023 07:46:40 +0000 Subject: [PATCH 7/8] Update README.md (#456) * Update README.md Addition of source references. * Final QA --------- Co-authored-by: Geir Arne Hjelle --- jupyter-lab-files/Movies.json | 2 +- jupyter-lab-files/README.md | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/jupyter-lab-files/Movies.json b/jupyter-lab-files/Movies.json index 4d3293f2c2..1ca4422af8 100644 --- a/jupyter-lab-files/Movies.json +++ b/jupyter-lab-files/Movies.json @@ -9,7 +9,7 @@ { "id": 1, "title": "Pal Joey", - "release_year": 1953, + "release_year": 1957, "star": "Francis Albert Sinatra" }, { diff --git a/jupyter-lab-files/README.md b/jupyter-lab-files/README.md index 1462755f5c..4d13055ef8 100644 --- a/jupyter-lab-files/README.md +++ b/jupyter-lab-files/README.md @@ -11,3 +11,17 @@ The easiest way to incorporate these files into JupyterLab is to copy them into ## Usage After creating and activating your virtual environment, installing the dependencies, and starting the JupyterLab server, you can load any file by double-clicking on it from the _file browser_ in JupyterLab. + +## Information Sources + +The `howto-regex.pdf` file is a PDF copy of an [article](https://docs.python.org/3/howto/regex.html) published as part of the Python documentation. The PDF version itself was sourced from the FOSSIES [open source archive](https://fossies.org/linux/python-docs-pdf-a4/howto-regex.pdf). + +The population data used in this article was sourced from the [Worldometers free reference web site](https://www.worldometers.info/world-population/world-population-by-year/). + +Information on the effects of the changes in the world's population was sourced in the [free learning resources](https://ugc.berkeley.edu/background-content/population-growth/) provided by the University of California. 
+ +All movie data was sourced from various Wikipedia articles: [List of James Bond films](https://en.wikipedia.org/wiki/List_of_James_Bond_films), [Dr. No](https://en.wikipedia.org/wiki/Dr._No_(film)), [Pal Joey](https://en.wikipedia.org/wiki/Pal_Joey_(film)), and [The Godfather](https://en.wikipedia.org/wiki/The_Godfather). + +`Glasgow.jpg` is a photograph of George Square in Glasgow. (C) Ian Eyre + +All other content was created specifically for this article. From d9a13230799e3a54d57b7402b034ea56fcc3acf5 Mon Sep 17 00:00:00 2001 From: Geir Arne Hjelle Date: Tue, 7 Nov 2023 16:45:30 +0100 Subject: [PATCH 8/8] Final QA (#457) --- embeddings-and-vector-databases-with-chromadb/README.md | 2 +- embeddings-and-vector-databases-with-chromadb/car_data_etl.py | 4 +--- embeddings-and-vector-databases-with-chromadb/chroma_utils.py | 4 ++-- .../cosine_similarity.py | 2 +- .../intro_to_vectors.py | 2 +- 5 files changed, 6 insertions(+), 8 deletions(-) diff --git a/embeddings-and-vector-databases-with-chromadb/README.md b/embeddings-and-vector-databases-with-chromadb/README.md index 429fc348dc..ee0d1636d0 100644 --- a/embeddings-and-vector-databases-with-chromadb/README.md +++ b/embeddings-and-vector-databases-with-chromadb/README.md @@ -1,6 +1,6 @@ # Embeddings and Vector Databases With ChromaDB -Supporting code for the Real Python tutorial [Embeddings and Vector Databases With ChromaDB](https://realpython.com/embeddings-and-vector-databases-with-chromadb/). +Supporting code for the Real Python tutorial [Embeddings and Vector Databases With ChromaDB](https://realpython.com/chromadb-vector-database/). To run the code in this tutorial, you should have `numpy`, `spacy`, `sentence-transformers`, `chromadb`, `polars`, `more-itertools`, and `openai` installed in your environment. 
diff --git a/embeddings-and-vector-databases-with-chromadb/car_data_etl.py b/embeddings-and-vector-databases-with-chromadb/car_data_etl.py index 1026bc9f2d..3f8fdb171b 100644 --- a/embeddings-and-vector-databases-with-chromadb/car_data_etl.py +++ b/embeddings-and-vector-databases-with-chromadb/car_data_etl.py @@ -57,6 +57,4 @@ def prepare_car_reviews_data( documents = car_review_db_data["Review"].to_list() metadatas = car_review_db_data.drop("Review").to_dicts() - chroma_data = {"ids": ids, "documents": documents, "metadatas": metadatas} - - return chroma_data + return {"ids": ids, "documents": documents, "metadatas": metadatas} diff --git a/embeddings-and-vector-databases-with-chromadb/chroma_utils.py b/embeddings-and-vector-databases-with-chromadb/chroma_utils.py index b45f12c934..253c191a1f 100644 --- a/embeddings-and-vector-databases-with-chromadb/chroma_utils.py +++ b/embeddings-and-vector-databases-with-chromadb/chroma_utils.py @@ -8,7 +8,7 @@ def build_chroma_collection( chroma_path: pathlib.Path, collection_name: str, - embbeding_func_name: str, + embedding_func_name: str, ids: list[str], documents: list[str], metadatas: list[dict], @@ -19,7 +19,7 @@ def build_chroma_collection( chroma_client = chromadb.PersistentClient(chroma_path) embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction( - model_name=embbeding_func_name + model_name=embedding_func_name ) collection = chroma_client.create_collection( diff --git a/embeddings-and-vector-databases-with-chromadb/cosine_similarity.py b/embeddings-and-vector-databases-with-chromadb/cosine_similarity.py index cb3a416edd..bc642bf95b 100644 --- a/embeddings-and-vector-databases-with-chromadb/cosine_similarity.py +++ b/embeddings-and-vector-databases-with-chromadb/cosine_similarity.py @@ -4,4 +4,4 @@ def compute_cosine_similarity(u: np.ndarray, v: np.ndarray) -> float: """Compute the cosine similarity between two vectors""" - return u.dot(v) / (np.linalg.norm(u) * np.linalg.norm(v)) + return (u @ v) 
/ (np.linalg.norm(u) * np.linalg.norm(v)) diff --git a/embeddings-and-vector-databases-with-chromadb/intro_to_vectors.py b/embeddings-and-vector-databases-with-chromadb/intro_to_vectors.py index 2659a23aeb..9d1aad61f3 100644 --- a/embeddings-and-vector-databases-with-chromadb/intro_to_vectors.py +++ b/embeddings-and-vector-databases-with-chromadb/intro_to_vectors.py @@ -20,4 +20,4 @@ # Dot product print(np.sum(v1 * v2)) -print(v1.dot(v3)) +print(v1 @ v3)