
Commit

added retrieval related scripts and other experiments
alkidbaci committed Oct 27, 2024
1 parent 7e0e60a commit 81de4cd
Showing 12 changed files with 534 additions and 0 deletions.
70 changes: 70 additions & 0 deletions general_working_directory/question_generation.py
@@ -0,0 +1,70 @@
import base64

import aiohttp
import asyncio
import time
from openai import OpenAI
from owlapy.iri import IRI
from owlapy.owl_ontology_manager import OntologyManager
from owlapy.owl_property import OWLDataProperty
from owlapy.owl_reasoner import OntologyReasoner, FastInstanceCheckerReasoner
from rdflib import Graph, URIRef, Literal, BNode, RDFS, OWL, Namespace, RDF
from rdflib.namespace import XSD
from owlapy.owl_individual import OWLNamedIndividual

def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

manager = OntologyManager()
ontology = manager.load_ontology(IRI.create("file://fashionpedia-third-generation.owl"))
base_reasoner = OntologyReasoner(ontology)
reasoner = FastInstanceCheckerReasoner(base_reasoner=base_reasoner, ontology=ontology)
dprop1 = OWLDataProperty(IRI.create("http://example.org/hasFileName"))
dprop2 = OWLDataProperty(IRI.create("http://example.org/hasDescription"))
dprop3 = OWLDataProperty(IRI.create("http://example.org/hasLLMDescription"))
image_iri_as_str = "http://example.org/image_25521"
image_ind = OWLNamedIndividual(image_iri_as_str)

image_filename = "images/" + str(list(reasoner.data_property_values(image_ind, dprop1))[0].get_literal())
llm_description = str(list(reasoner.data_property_values(image_ind, dprop3))[0].get_literal())
base64_image = encode_image(image_filename)
all_descriptions = ""

# "Consider you are a user that is looking for clothes and other apparels in an online recommandation system."
# "Formulate a query of a prompt-like structure that the user would use in such a way that the attached image would be returned. To generate the query you can take in consideration the following auxiliary information about the image:"
# f"{all_descriptions}"
# f"{llm_description}"

# "Only write the query which should be a question and always end with a questionmark."
for d in list(reasoner.data_property_values(image_ind,dprop2)):
all_descriptions = all_descriptions + d.get_literal() + "\n"

client = OpenAI(base_url="http://tentris-ml.cs.upb.de:8501/v1", api_key="token-tentris-upb")
print(client.chat.completions.create(
    model="tentris",
    messages=[
        {
            "role": "user",
            "content":
                [
                    {
                        "type": "text",
                        "text": "Consider you are a user that is looking for clothes/apparels in an online recommendation system. "
                                "Formulate a query of a prompt-like structure that you would ask in such a way that the attached image would be recommended to you. "
                                "To generate the query you can take into consideration the following auxiliary information about the image:\n"
                                f"{all_descriptions}"
                                f"{llm_description}\n"
                                "Only write the query, which should be a question and always end with a question mark."
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}"
                        }
                    }
                ]
        }
    ],
    temperature=0.1,
    seed=1
).choices[0].message.content)
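The aiohttp, asyncio and time imports are unused and the script only handles the hard-coded individual image_25521. A minimal sketch of running the same request over several images, assuming openai's AsyncOpenAI client against the same endpoint and a hypothetical batch of (prompt_text, base64_image) pairs prepared exactly as above:

import asyncio
from openai import AsyncOpenAI

async_client = AsyncOpenAI(base_url="http://tentris-ml.cs.upb.de:8501/v1", api_key="token-tentris-upb")

async def generate_question(prompt_text: str, b64_image: str) -> str:
    # Same multimodal message layout as the synchronous call above
    response = await async_client.chat.completions.create(
        model="tentris",
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": prompt_text},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64_image}"}},
            ],
        }],
        temperature=0.1,
        seed=1,
    )
    return response.choices[0].message.content

async def main(batch):
    # batch: iterable of (prompt_text, base64_image) pairs -- hypothetical, built as in the script
    return await asyncio.gather(*(generate_question(p, img) for p, img in batch))

# questions = asyncio.run(main(batch))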
66 changes: 66 additions & 0 deletions local_working_directory/bm25_fast.py
@@ -0,0 +1,66 @@
""" Implementation of OKapi BM25 with sklearn's TfidfVectorizer
Distributed as CC-0 (https://creativecommons.org/publicdomain/zero/1.0/)
"""

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
import rdflib

class BM25(object):
    def __init__(self, b=0.75, k1=1.6):
        self.vectorizer = TfidfVectorizer(norm=None, smooth_idf=False)
        self.b = b
        self.k1 = k1

    def fit(self, X):
        """ Fit IDF to documents X """
        self.vectorizer.fit(X)
        y = super(TfidfVectorizer, self.vectorizer).transform(X)
        self.avdl = y.sum(1).mean()

    def transform(self, q, X):
        """ Calculate BM25 between query q and documents X """
        b, k1, avdl = self.b, self.k1, self.avdl

        # apply CountVectorizer
        X = super(TfidfVectorizer, self.vectorizer).transform(X)
        len_X = X.sum(1).A1
        q, = super(TfidfVectorizer, self.vectorizer).transform([q])
        assert sparse.isspmatrix_csr(q)

        # convert to csc for better column slicing
        X = X.tocsc()[:, q.indices]
        denom = X + (k1 * (1 - b + b * len_X / avdl))[:, None]
        # sklearn computes idf(t) = log [ n / df(t) ] + 1, so subtract 1
        # to get the BM25 idf(t) = log [ n / df(t) ]
        idf = self.vectorizer._tfidf.idf_[None, q.indices] - 1.
        numer = X.multiply(np.broadcast_to(idf, X.shape)) * (k1 + 1)
        return (numer / denom).sum(1).A1



#------------ End of library impl. What follows is an example -----------------
from sklearn.datasets import fetch_20newsgroups
# documents = fetch_20newsgroups(subset='train').data
g = rdflib.Graph()
g.parse("fashionpedia-third-generation.owl", format="xml")

# Extract triplets
triplets = []
cn = 0
for subj, pred, obj in g:
    triplets.append((str(subj), str(pred), str(obj)))
    # cn += 1
    # if cn > 10000:
    #     break
print(len(triplets))
# Index the data (convert triplets to text format)
documents = ["\n".join(triplet) for triplet in triplets]

bm25 = BM25()
bm25.fit(documents)
# Find the similar documents given query
query = "What are some clothes containing blue tshirt with long sleeves?"
scores = bm25.transform(query, documents)
print(documents[np.argmax(scores)])
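For reference, transform() above computes the standard Okapi BM25 score of each document d against the query q; the -1 correction on sklearn's idf recovers the plain log(n/df) weight:

\mathrm{score}(q, d) = \sum_{t \in q} \log\frac{n}{\mathrm{df}(t)} \cdot \frac{f(t,d)\,(k_1 + 1)}{f(t,d) + k_1\left(1 - b + b\,\frac{|d|}{\mathrm{avgdl}}\right)}

The example prints only the single best match; np.argsort(scores)[::-1][:10], for instance, would give a ranked top-10 instead.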
40 changes: 40 additions & 0 deletions local_working_directory/bm25_retriever_example.py
@@ -0,0 +1,40 @@
import os

os.environ["OPENAI_API_KEY"] = "token-tentris-upb"

from llama_index.core import Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import SimpleDirectoryReader
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core.response.notebook_utils import display_source_node
import Stemmer

Settings.llm = OpenAI(model="gpt-3.5-turbo")
Settings.embed_model = OpenAIEmbedding(model_name="text-embedding-3-small")

# load documents
documents = SimpleDirectoryReader("./data/text-from-kg").load_data()

# initialize node parser
splitter = SentenceSplitter(chunk_size=512)
nodes = splitter.get_nodes_from_documents(documents)
# We can pass in the index, docstore, or list of nodes to create the retriever
bm25_retriever = BM25Retriever.from_defaults(
    nodes=nodes,
    similarity_top_k=2,
    # Optional: We can pass in the stemmer and set the language for stopwords
    # This is important for removing stopwords and stemming the query + text
    # The default is english for both
    stemmer=Stemmer.Stemmer("english"),
    language="english",
)

retrieved_nodes = bm25_retriever.retrieve(
    "What are some clothing options that include a black leather jacket with a zip-up front, above-the-hip length, asymmetrical silhouette, and biker style?"
)
for node in retrieved_nodes:
    print(node.text)
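Since retrieve() returns NodeWithScore objects, the BM25 score can be printed next to each hit; a minimal variant of the loop above, with an illustrative shorter query:

for hit in bm25_retriever.retrieve("black leather jacket with a zip-up front"):
    # hit.score is the BM25 score, hit.text the matched chunk
    print(hit.score, hit.text)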


29 changes: 29 additions & 0 deletions local_working_directory/bm25_retriever_kg_data.py
@@ -0,0 +1,29 @@
import os

os.environ["OPENAI_API_KEY"] = "token-tentris-upb"
import rdflib
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core import Settings, Document
from llama_index.core.storage.docstore.simple_docstore import SimpleDocumentStore
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.node_parser import SentenceSplitter
import Stemmer


Settings.llm = OpenAI(model="gpt-3.5-turbo")
Settings.embed_model = OpenAIEmbedding(model_name="text-embedding-3-small")

g = rdflib.Graph()
g.parse("fashionpedia-second-generation.owl", format="xml")

# Extract triplets
triplets = []
for subj, pred, obj in g:
    triplets.append((str(subj), str(pred), str(obj)))

# Index the data (convert triplets to text format)
text_data = ["\n".join(triplet) for triplet in triplets]

with open('fashionpedia-second-generation.txt', 'w') as file:
    # writelines() does not add separators, so terminate each triple explicitly
    file.writelines(t + "\n" for t in text_data)
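The script stops after exporting the triples, so the BM25Retriever, SentenceSplitter and Stemmer imports at the top are never used. A possible continuation under that assumption, reusing those imports and mirroring bm25_retriever_example.py (the query string is illustrative):

documents = [Document(text=t) for t in text_data]                  # one Document per triple
nodes = SentenceSplitter(chunk_size=512).get_nodes_from_documents(documents)
retriever = BM25Retriever.from_defaults(
    nodes=nodes,
    similarity_top_k=5,
    stemmer=Stemmer.Stemmer("english"),
    language="english",
)
for hit in retriever.retrieve("blue t-shirt with long sleeves"):
    print(hit.score, hit.text)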
86 changes: 86 additions & 0 deletions local_working_directory/recommendation_script1.py
@@ -0,0 +1,86 @@
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np
from owlapy.iri import IRI
from owlapy.owl_individual import OWLNamedIndividual
from owlapy.owl_ontology_manager import OntologyManager
from owlapy.owl_property import OWLDataProperty
from owlapy.owl_reasoner import OntologyReasoner, FastInstanceCheckerReasoner
from transformers import AutoModel
import json
import torch
import pandas as pd
import torch.nn.functional as F
import polars as pl

k = 3
# Set to None if you want to read all
nrows = None

with open('../image-filename-mappings.json', 'r') as file:
    filename_of = json.load(file)

manager = OntologyManager()
ontology = manager.load_ontology(IRI.create("file://fashionpedia-second-generation.owl"))
base_reasoner = OntologyReasoner(ontology)
reasoner = FastInstanceCheckerReasoner(base_reasoner=base_reasoner, ontology=ontology)
has_description = OWLDataProperty(IRI.create("http://example.org/hasDescription"))


# (1) Load the csv file fashionpedia-embeddings.csv".
print("Reading embeddings", end="\t")
df = pd.read_csv("../fashionpedia-embeddings.csv", index_col=0, nrows=nrows)
print(df.shape)
# (2) D a matrix each row represents an embedding vector
document_embeddings = F.normalize(torch.from_numpy(df.values).type(torch.float32), p=2, dim=1)
# (3)
document_ordered_names = df.index.values.tolist()
# (4) Initialize the embedder
print("Loading embedding model", end="\t")
model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-de', trust_remote_code=True,
                                  torch_dtype=torch.bfloat16)
# model = model
while True:
    query = input('What do you like to wear?\n')  # "I like a dress with wide neckline"
    print(f"QUERY:{query}")
    # query_embeddings: np.ndarray: torchFloatTensor: dim x 1
    query_embeddings = torch.from_numpy(model.encode(query))
    query_embeddings = F.normalize(query_embeddings.reshape(len(query_embeddings), -1), p=2, dim=0)

    similarities = (document_embeddings @ query_embeddings).flatten()

    top_scores, top_k_indices = torch.topk(similarities, k)
    top_k_indices = top_k_indices.cpu().numpy()
    # Plot k images given user's query.
    seen_set = set()
    pos_set = dict()
    neg_set = dict()
    for i in top_k_indices:
        # Text Preprocess
        try:
            image = document_ordered_names[i][:-2]
            filename = filename_of[image]
            if filename not in seen_set:
                seen_set.add(filename)
                img = np.asarray(Image.open(f"../images/{filename}"))
                plt.imshow(img)
                plt.show()

                ind = OWLNamedIndividual("http://example.org/" + image)
                all_desc = list(reasoner.data_property_values(ind, has_description))
                selected_desc = all_desc[int(document_ordered_names[i][-1]) - 1]
                print(selected_desc.get_literal())

                feedback = input('Does this image contain something that fits your preferences? (y/n)\n')
                if feedback == "y":
                    pos_set[image] = selected_desc.get_literal()
                elif feedback == "n":
                    neg_set[image] = selected_desc.get_literal()
                else:
                    print('Neutral selected')

        except KeyError:
            print(f"{i} not found")

    print(f"Positive examples: {pos_set}")
    print(f"Negative examples: {neg_set}")
21 changes: 21 additions & 0 deletions local_working_directory/single_question_fragmentation.py
@@ -0,0 +1,21 @@
from openai import OpenAI

client = OpenAI(base_url="http://tentris-ml.cs.upb.de:8501/v1", api_key="token-tentris-upb")
print(client.chat.completions.create(
    model="tentris",
    messages=[
        {
            "role": "user",
            "content":
                [
                    {
                        "type": "text",
                        "text": "Can you separate each part of the following question into self-contained questions: "
                                "What are some clothing options that include a black leather jacket, a light blue denim shirt with a white collar, and black pants, suitable for a casual yet edgy style?"
                    },
                ]
        }
    ],
    temperature=0.1,
    seed=1
).choices[0].message.content)
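If the sub-questions are needed programmatically rather than just printed, the reply has to be split apart again; a minimal post-processing sketch, assuming the model answers with one question per line (the prompt does not currently enforce this) and that the prompt text is kept in a hypothetical question_splitting_prompt variable:

response = client.chat.completions.create(
    model="tentris",
    messages=[{"role": "user", "content": [{"type": "text", "text": question_splitting_prompt}]}],
    temperature=0.1,
    seed=1,
)
# keep only lines that look like questions
sub_questions = [line.strip() for line in response.choices[0].message.content.splitlines()
                 if line.strip().endswith("?")]
print(sub_questions)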
70 changes: 70 additions & 0 deletions local_working_directory/single_question_generation.py
@@ -0,0 +1,70 @@
import base64

import aiohttp
import asyncio
import time
from openai import OpenAI
from owlapy.iri import IRI
from owlapy.owl_ontology_manager import OntologyManager
from owlapy.owl_property import OWLDataProperty
from owlapy.owl_reasoner import OntologyReasoner, FastInstanceCheckerReasoner
from rdflib import Graph, URIRef, Literal, BNode, RDFS, OWL, Namespace, RDF
from rdflib.namespace import XSD
from owlapy.owl_individual import OWLNamedIndividual

def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

manager = OntologyManager()
ontology = manager.load_ontology(IRI.create("file://fashionpedia-third-generation.owl"))
base_reasoner = OntologyReasoner(ontology)
reasoner = FastInstanceCheckerReasoner(base_reasoner=base_reasoner, ontology=ontology)
dprop1 = OWLDataProperty(IRI.create("http://example.org/hasFileName"))
dprop2 = OWLDataProperty(IRI.create("http://example.org/hasDescription"))
dprop3 = OWLDataProperty(IRI.create("http://example.org/hasLLMDescription"))
image_iri_as_str = "http://example.org/image_25521"
image_ind = OWLNamedIndividual(image_iri_as_str)

image_filename = "images/" + str(list(reasoner.data_property_values(image_ind, dprop1))[0].get_literal())
llm_description = str(list(reasoner.data_property_values(image_ind, dprop3))[0].get_literal())
base64_image = encode_image(image_filename)
all_descriptions = ""

# "Consider you are a user that is looking for clothes and other apparels in an online recommandation system."
# "Formulate a query of a prompt-like structure that the user would use in such a way that the attached image would be returned. To generate the query you can take in consideration the following auxiliary information about the image:"
# f"{all_descriptions}"
# f"{llm_description}"

# "Only write the query which should be a question and always end with a questionmark."
for d in list(reasoner.data_property_values(image_ind,dprop2)):
all_descriptions = all_descriptions + d.get_literal() + "\n"

client = OpenAI(base_url="http://tentris-ml.cs.upb.de:8501/v1", api_key="token-tentris-upb")
print(client.chat.completions.create(
    model="tentris",
    messages=[
        {
            "role": "user",
            "content":
                [
                    {
                        "type": "text",
                        "text": "Consider you are a user that is looking for clothes/apparels in an online recommendation system. "
                                "Formulate a query of a prompt-like structure that you would ask in such a way that the attached image would be recommended to you. "
                                "To generate the query you can take into consideration the following auxiliary information about the image:\n"
                                f"{all_descriptions}"
                                f"{llm_description}\n"
                                "Only write the query, which should be a question and always end with a question mark."
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}"
                        }
                    }
                ]
        }
    ],
    temperature=0.1,
    seed=1
).choices[0].message.content)

0 comments on commit 81de4cd
