diff --git a/general_working_directory/question_generation.py b/general_working_directory/question_generation.py
new file mode 100644
index 0000000..af10ed4
--- /dev/null
+++ b/general_working_directory/question_generation.py
@@ -0,0 +1,70 @@
+import base64
+
+import aiohttp
+import asyncio
+import time
+from openai import OpenAI
+from owlapy.iri import IRI
+from owlapy.owl_ontology_manager import OntologyManager
+from owlapy.owl_property import OWLDataProperty
+from owlapy.owl_reasoner import OntologyReasoner, FastInstanceCheckerReasoner
+from rdflib import Graph, URIRef, Literal, BNode, RDFS, OWL, Namespace, RDF
+from rdflib.namespace import XSD
+from owlapy.owl_individual import OWLNamedIndividual
+
+def encode_image(image_path):
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode('utf-8')
+
+manager = OntologyManager()
+ontology = manager.load_ontology(IRI.create("file://fashionpedia-third-generation.owl"))
+base_reasoner = OntologyReasoner(ontology)
+reasoner = FastInstanceCheckerReasoner(base_reasoner=base_reasoner, ontology=ontology)
+dprop1 = OWLDataProperty(IRI.create("http://example.org/hasFileName"))
+dprop2 = OWLDataProperty(IRI.create("http://example.org/hasDescription"))
+dprop3 = OWLDataProperty(IRI.create("http://example.org/hasLLMDescription"))
+image_iri_as_str = "http://example.org/image_25521"
+image_ind = OWLNamedIndividual(image_iri_as_str)
+
+image_filename = "images/" + str(list(reasoner.data_property_values(image_ind, dprop1))[0].get_literal())
+llm_description = str(list(reasoner.data_property_values(image_ind, dprop3))[0].get_literal())
+base64_image = encode_image(image_filename)
+all_descriptions = ""
+
+# "Consider you are a user who is looking for clothes and other apparel in an online recommendation system."
+# "Formulate a prompt-like query that the user would use such that the attached image would be returned. To generate the query, you can take into consideration the following auxiliary information about the image:"
+# f"{all_descriptions}"
+# f"{llm_description}"
+
+# "Only write the query, which should be a question, and always end with a question mark."
+for d in list(reasoner.data_property_values(image_ind, dprop2)):
+    all_descriptions = all_descriptions + d.get_literal() + "\n"
+
+client = OpenAI(base_url="http://tentris-ml.cs.upb.de:8501/v1", api_key="token-tentris-upb")
+print(client.chat.completions.create(
+    model="tentris",
+    messages=[
+        {
+            "role": "user",
+            "content":
+                [
+                    {
+                        "type": "text",
+                        "text": "Consider you are a user who is looking for clothes/apparel in an online recommendation system. "
+                                "Formulate a prompt-like query that you would ask such that the attached image would be recommended to you. To generate the query, you can take into consideration the following auxiliary information about the image: "
+                                f"{all_descriptions}"
+                                f"{llm_description} "
+                                "Only write the query, which should be a question, and always end with a question mark."
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": f"data:image/jpeg;base64,{base64_image}"
+                        }
+                    }
+                ]
+        }
+    ],
+    temperature=0.1,
+    seed=1
+).choices[0].message.content)
diff --git a/local_working_directory/000b3a87508b0fa185fbd53ecbe2e4c6.jpg b/local_working_directory/000b3a87508b0fa185fbd53ecbe2e4c6.jpg
new file mode 100644
index 0000000..082d661
Binary files /dev/null and b/local_working_directory/000b3a87508b0fa185fbd53ecbe2e4c6.jpg differ
diff --git a/local_working_directory/000c9b4926cd78edd4c19cbc6beba111.jpg b/local_working_directory/000c9b4926cd78edd4c19cbc6beba111.jpg
new file mode 100644
index 0000000..81a29b8
Binary files /dev/null and b/local_working_directory/000c9b4926cd78edd4c19cbc6beba111.jpg differ
diff --git a/local_working_directory/000e973c99dc090afd7898c93daf0dbc.jpg b/local_working_directory/000e973c99dc090afd7898c93daf0dbc.jpg
new file mode 100644
index 0000000..09daa1d
Binary files /dev/null and b/local_working_directory/000e973c99dc090afd7898c93daf0dbc.jpg differ
diff --git a/local_working_directory/bm25_fast.py b/local_working_directory/bm25_fast.py
new file mode 100644
index 0000000..0b169e6
--- /dev/null
+++ b/local_working_directory/bm25_fast.py
@@ -0,0 +1,66 @@
+""" Implementation of Okapi BM25 with sklearn's TfidfVectorizer
+Distributed as CC-0 (https://creativecommons.org/publicdomain/zero/1.0/)
+"""
+
+import numpy as np
+from sklearn.feature_extraction.text import TfidfVectorizer
+from scipy import sparse
+import rdflib
+
+class BM25(object):
+    def __init__(self, b=0.75, k1=1.6):
+        self.vectorizer = TfidfVectorizer(norm=None, smooth_idf=False)
+        self.b = b
+        self.k1 = k1
+
+    def fit(self, X):
+        """ Fit IDF to documents X """
+        self.vectorizer.fit(X)
+        y = super(TfidfVectorizer, self.vectorizer).transform(X)
+        self.avdl = y.sum(1).mean()
+
+    def transform(self, q, X):
+        """ Calculate BM25 between query q and documents X """
+        b, k1, avdl = self.b, self.k1, self.avdl
+
+        # apply CountVectorizer
+        X = super(TfidfVectorizer, self.vectorizer).transform(X)
+        len_X = X.sum(1).A1
+        q, = super(TfidfVectorizer, self.vectorizer).transform([q])
+        assert sparse.isspmatrix_csr(q)
+
+        # convert to csc for better column slicing
+        X = X.tocsc()[:, q.indices]
+        denom = X + (k1 * (1 - b + b * len_X / avdl))[:, None]
+        # idf(t) = log [ n / df(t) ] + 1 in sklearn, so it needs to be converted
+        # to idf(t) = log [ n / df(t) ] by subtracting 1
+        idf = self.vectorizer._tfidf.idf_[None, q.indices] - 1.
+        numer = X.multiply(np.broadcast_to(idf, X.shape)) * (k1 + 1)
+        return (numer / denom).sum(1).A1
+
+
+# ------------ End of library implementation. The following is an example -----------------
+from sklearn.datasets import fetch_20newsgroups
+# documents = fetch_20newsgroups(subset='train').data
+g = rdflib.Graph()
+g.parse("fashionpedia-third-generation.owl", format="xml")
+
+# Extract triplets
+triplets = []
+cn = 0
+for subj, pred, obj in g:
+    triplets.append((str(subj), str(pred), str(obj)))
+    # cn += 1
+    # if cn > 10000:
+    #     break
+print(len(triplets))
+# Index the data (convert triplets to text format)
+documents = ["\n".join(triplet) for triplet in triplets]
+
+bm25 = BM25()
+bm25.fit(documents)
+# Find the most similar document given a query
+query = "What are some clothes containing blue tshirt with long sleeves?"
+scores = bm25.transform(query, documents)
+print(documents[np.argmax(scores)])
diff --git a/local_working_directory/bm25_retriever_example.py b/local_working_directory/bm25_retriever_example.py
new file mode 100644
index 0000000..de47b42
--- /dev/null
+++ b/local_working_directory/bm25_retriever_example.py
@@ -0,0 +1,40 @@
+import os
+
+os.environ["OPENAI_API_KEY"] = "token-tentris-upb"
+
+from llama_index.core import Settings
+from llama_index.llms.openai import OpenAI
+from llama_index.embeddings.openai import OpenAIEmbedding
+from llama_index.core.node_parser import SentenceSplitter
+from llama_index.core import SimpleDirectoryReader
+from llama_index.retrievers.bm25 import BM25Retriever
+from llama_index.core.response.notebook_utils import display_source_node
+import Stemmer
+
+Settings.llm = OpenAI(model="gpt-3.5-turbo")
+Settings.embed_model = OpenAIEmbedding(model_name="text-embedding-3-small")
+
+# load documents
+documents = SimpleDirectoryReader("./data/text-from-kg").load_data()
+
+# initialize node parser
+splitter = SentenceSplitter(chunk_size=512)
+nodes = splitter.get_nodes_from_documents(documents)
+# We can pass in the index, docstore, or list of nodes to create the retriever
+bm25_retriever = BM25Retriever.from_defaults(
+    nodes=nodes,
+    similarity_top_k=2,
+    # Optional: We can pass in the stemmer and set the language for stopwords
+    # This is important for removing stopwords and stemming the query + text
+    # The default is English for both
+    stemmer=Stemmer.Stemmer("english"),
+    language="english",
+)
+
+retrieved_nodes = bm25_retriever.retrieve(
+    "What are some clothing options that include a black leather jacket with a zip-up front, above-the-hip length, asymmetrical silhouette, and biker style?"
+)
+for node in retrieved_nodes:
+    print(node.text)
diff --git a/local_working_directory/bm25_retriever_kg_data.py b/local_working_directory/bm25_retriever_kg_data.py
new file mode 100644
index 0000000..548a6ac
--- /dev/null
+++ b/local_working_directory/bm25_retriever_kg_data.py
@@ -0,0 +1,29 @@
+import os
+
+os.environ["OPENAI_API_KEY"] = "token-tentris-upb"
+import rdflib
+from llama_index.retrievers.bm25 import BM25Retriever
+from llama_index.core import Settings, Document
+from llama_index.core.storage.docstore.simple_docstore import SimpleDocumentStore
+from llama_index.llms.openai import OpenAI
+from llama_index.embeddings.openai import OpenAIEmbedding
+from llama_index.core.node_parser import SentenceSplitter
+import Stemmer
+
+
+Settings.llm = OpenAI(model="gpt-3.5-turbo")
+Settings.embed_model = OpenAIEmbedding(model_name="text-embedding-3-small")
+
+g = rdflib.Graph()
+g.parse("fashionpedia-second-generation.owl", format="xml")
+
+# Extract triplets
+triplets = []
+for subj, pred, obj in g:
+    triplets.append((str(subj), str(pred), str(obj)))
+
+# Index the data (convert triplets to text format)
+text_data = ["\n".join(triplet) for triplet in triplets]
+
+# writelines adds no separators itself, so terminate each triplet block with a newline
+with open('fashionpedia-second-generation.txt', 'w') as file:
+    file.writelines(doc + "\n" for doc in text_data)
diff --git a/local_working_directory/recommendation_script1.py b/local_working_directory/recommendation_script1.py
new file mode 100644
index 0000000..3c86d90
--- /dev/null
+++ b/local_working_directory/recommendation_script1.py
@@ -0,0 +1,86 @@
+from PIL import Image
+import matplotlib.pyplot as plt
+import numpy as np
+from owlapy.iri import IRI
+from owlapy.owl_individual import OWLNamedIndividual
+from owlapy.owl_ontology_manager import OntologyManager
+from owlapy.owl_property import OWLDataProperty
+from owlapy.owl_reasoner import OntologyReasoner, FastInstanceCheckerReasoner
+from transformers import AutoModel
+import json
+import torch
+import pandas as pd
+import torch.nn.functional as F
+import polars as pl
+
+k = 3
+# Set to None if you want to read all rows
+nrows = None
+
+with open('../image-filename-mappings.json', 'r') as file:
+    filename_of = json.load(file)
+
+manager = OntologyManager()
+ontology = manager.load_ontology(IRI.create("file://fashionpedia-second-generation.owl"))
+base_reasoner = OntologyReasoner(ontology)
+reasoner = FastInstanceCheckerReasoner(base_reasoner=base_reasoner, ontology=ontology)
+has_description = OWLDataProperty(IRI.create("http://example.org/hasDescription"))
+
+
+# (1) Load the csv file "fashionpedia-embeddings.csv".
+print("Reading embeddings", end="\t")
+df = pd.read_csv("../fashionpedia-embeddings.csv", index_col=0, nrows=nrows)
+print(df.shape)
+# (2) D: a matrix whose rows are the document embedding vectors
+document_embeddings = F.normalize(torch.from_numpy(df.values).type(torch.float32), p=2, dim=1)
+# (3) Document names ordered as in the embedding matrix
+document_ordered_names = df.index.values.tolist()
+# (4) Initialize the embedder
+print("Loading embedding model", end="\t")
+model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-de', trust_remote_code=True,
+                                  torch_dtype=torch.bfloat16)
+while True:
+    query = input('What would you like to wear?\n')  # e.g. "I like a dress with wide neckline"
+    print(f"QUERY:{query}")
+    # query_embeddings: torch.FloatTensor of shape (dim, 1)
+    query_embeddings = torch.from_numpy(model.encode(query))
+    query_embeddings = F.normalize(query_embeddings.reshape(len(query_embeddings), -1), p=2, dim=0)
+
+    similarities = (document_embeddings @ query_embeddings).flatten()
+
+    top_scores, top_k_indices = torch.topk(similarities, k)
+    top_k_indices = top_k_indices.cpu().numpy()
+    # Plot the top-k images given the user's query.
+    seen_set = set()
+    pos_set = dict()
+    neg_set = dict()
+    for i in top_k_indices:
+        # Text preprocessing
+        try:
+            image = document_ordered_names[i][:-2]
+            filename = filename_of[image]
+            if filename not in seen_set:
+                seen_set.add(filename)
+                img = np.asarray(Image.open(f"../images/{filename}"))
+                plt.imshow(img)
+                plt.show()
+
+                ind = OWLNamedIndividual("http://example.org/" + image)
+                all_desc = list(reasoner.data_property_values(ind, has_description))
+                selected_desc = all_desc[int(document_ordered_names[i][-1]) - 1]
+                print(selected_desc.get_literal())
+
+                feedback = input('Does this image contain something that fits your preferences? (y/n)\n')
+                if feedback == "y":
+                    pos_set[image] = selected_desc.get_literal()
+                elif feedback == "n":
+                    neg_set[image] = selected_desc.get_literal()
+                else:
+                    print('Neutral selected')
+
+        except KeyError:
+            print(f"{i} not found")
+
+    print(f"Positive examples: {pos_set}")
+    print(f"Negative examples: {neg_set}")
diff --git a/local_working_directory/single_question_fragmentation.py b/local_working_directory/single_question_fragmentation.py
new file mode 100644
index 0000000..54255c7
--- /dev/null
+++ b/local_working_directory/single_question_fragmentation.py
@@ -0,0 +1,21 @@
+from openai import OpenAI
+
+client = OpenAI(base_url="http://tentris-ml.cs.upb.de:8501/v1", api_key="token-tentris-upb")
+print(client.chat.completions.create(
+    model="tentris",
+    messages=[
+        {
+            "role": "user",
+            "content":
+                [
+                    {
+                        "type": "text",
+                        "text": "Can you separate each part of the following question into self-contained questions: "
+                                "What are some clothing options that include a black leather jacket, a light blue denim shirt with a white collar, and black pants, suitable for a casual yet edgy style?"
+                    },
+                ]
+        }
+    ],
+    temperature=0.1,
+    seed=1
+).choices[0].message.content)
diff --git a/local_working_directory/single_question_generation.py b/local_working_directory/single_question_generation.py
new file mode 100644
index 0000000..af10ed4
--- /dev/null
+++ b/local_working_directory/single_question_generation.py
@@ -0,0 +1,70 @@
+import base64
+
+import aiohttp
+import asyncio
+import time
+from openai import OpenAI
+from owlapy.iri import IRI
+from owlapy.owl_ontology_manager import OntologyManager
+from owlapy.owl_property import OWLDataProperty
+from owlapy.owl_reasoner import OntologyReasoner, FastInstanceCheckerReasoner
+from rdflib import Graph, URIRef, Literal, BNode, RDFS, OWL, Namespace, RDF
+from rdflib.namespace import XSD
+from owlapy.owl_individual import OWLNamedIndividual
+
+def encode_image(image_path):
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode('utf-8')
+
+manager = OntologyManager()
+ontology = manager.load_ontology(IRI.create("file://fashionpedia-third-generation.owl"))
+base_reasoner = OntologyReasoner(ontology)
+reasoner = FastInstanceCheckerReasoner(base_reasoner=base_reasoner, ontology=ontology)
+dprop1 = OWLDataProperty(IRI.create("http://example.org/hasFileName"))
+dprop2 = OWLDataProperty(IRI.create("http://example.org/hasDescription"))
+dprop3 = OWLDataProperty(IRI.create("http://example.org/hasLLMDescription"))
+image_iri_as_str = "http://example.org/image_25521"
+image_ind = OWLNamedIndividual(image_iri_as_str)
+
+image_filename = "images/" + str(list(reasoner.data_property_values(image_ind, dprop1))[0].get_literal())
+llm_description = str(list(reasoner.data_property_values(image_ind, dprop3))[0].get_literal())
+base64_image = encode_image(image_filename)
+all_descriptions = ""
+
+# "Consider you are a user who is looking for clothes and other apparel in an online recommendation system."
+# "Formulate a prompt-like query that the user would use such that the attached image would be returned. To generate the query, you can take into consideration the following auxiliary information about the image:"
+# f"{all_descriptions}"
+# f"{llm_description}"
+
+# "Only write the query, which should be a question, and always end with a question mark."
+for d in list(reasoner.data_property_values(image_ind, dprop2)):
+    all_descriptions = all_descriptions + d.get_literal() + "\n"
+
+client = OpenAI(base_url="http://tentris-ml.cs.upb.de:8501/v1", api_key="token-tentris-upb")
+print(client.chat.completions.create(
+    model="tentris",
+    messages=[
+        {
+            "role": "user",
+            "content":
+                [
+                    {
+                        "type": "text",
+                        "text": "Consider you are a user who is looking for clothes/apparel in an online recommendation system. "
+                                "Formulate a prompt-like query that you would ask such that the attached image would be recommended to you. To generate the query, you can take into consideration the following auxiliary information about the image: "
+                                f"{all_descriptions}"
+                                f"{llm_description} "
+                                "Only write the query, which should be a question, and always end with a question mark."
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": f"data:image/jpeg;base64,{base64_image}"
+                        }
+                    }
+                ]
+        }
+    ],
+    temperature=0.1,
+    seed=1
+).choices[0].message.content)
diff --git a/local_working_directory/third_generation_async.py b/local_working_directory/third_generation_async.py
new file mode 100644
index 0000000..46c0073
--- /dev/null
+++ b/local_working_directory/third_generation_async.py
@@ -0,0 +1,115 @@
+import base64
+
+import aiohttp
+import asyncio
+import time
+from openai import OpenAI
+from owlapy.iri import IRI
+from owlapy.owl_ontology_manager import OntologyManager
+from owlapy.owl_property import OWLDataProperty
+from owlapy.owl_reasoner import OntologyReasoner, FastInstanceCheckerReasoner
+from rdflib import Graph, URIRef, RDF, OWL, RDFS, XSD
+
+api_key = "token-tentris-upb"
+api_base = "http://tentris-ml.cs.upb.de:8501/v1"
+client = OpenAI(api_key=api_key, base_url=api_base)
+
+
+manager = OntologyManager()
+ontology = manager.load_ontology(IRI.create("file://../fashionpedia-second-generation.owl"))
+base_reasoner = OntologyReasoner(ontology)
+reasoner = FastInstanceCheckerReasoner(base_reasoner=base_reasoner, ontology=ontology)
+g = Graph()
+g.parse("../fashionpedia-second-generation.owl")
+
+has_llm_description = URIRef("http://example.org/hasLLMDescription")
+
+class_image = URIRef("http://example.org/Image")
+g.add((has_llm_description, RDF.type, OWL.DatatypeProperty))
+g.add((has_llm_description, RDFS.domain, class_image))
+g.add((has_llm_description, RDFS.range, XSD.string))
+
+
+def tentris_ensemble_llm():
+    completion = client.chat.completions.create(
+        model="tentris",
+        messages=[
+            {"role": "system", "content": "You are a helpful assistant. Which LLM would you use to answer the user's question? "
+ "We cannot effort to use largest model for every questions."}, + {"role": "user", "content": "What is the capital of Germany"}], + extra_body={"guided_choice": ["Qwen2.5-0.5B-Instruct", + "Qwen2.5-1.5B-Instruct" + "Qwen2.5-3B-Instruct", + "Qwen2.5-7B-Instruct", + "Llama-3.1-3B-Instruct", + "Llama-3.2-3B-Instruct"]}) + return completion.choices[0].message.content + + +headers = { + 'Authorization': f'Bearer {api_key}', 'Content-Type': 'application/json'} + + +# Asynchronous function to send a single request +async def send_async_command(payload, ind): + async with aiohttp.ClientSession() as session: + async with session.post(f'{api_base}/chat/completions', headers=headers, json=payload) as response: + if response.status == 200: + result = await response.json() + print(f"Result from {ind.str}: {result['content']}") + else: + print(f"Error in query about {ind.str}: {response.status}") + print(await response.text()) + + +# Define a function to create the payload for each query +def create_payload(base64_image): + return { + "model": "tentris", + "messages": [ + {"role": "user", + "content": [{ + "type": "text", + "text": "You are a fashion expert." + "Your task is to give a short description of the apparel shown in the attached image." + }, + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{base64_image}" + } + } + ] + } + ], "temperature": 0.1 + } + + +def encode_image(image_path): + with open(image_path, "rb") as image_file: + return base64.b64encode(image_file.read()).decode('utf-8') + + +# Asynchronous function to send 10 queries concurrently +async def send_multiple_queries(): + tasks = [] + count = 0 + for i in ontology.individuals_in_signature(): + dprop1 = OWLDataProperty(IRI.create("http://example.org/hasFileName")) + image = "../images/" + str(list(reasoner.data_property_values(i, dprop1))[0].get_literal()) + base64_image = encode_image(image) + payload = create_payload(base64_image) + tasks.append(send_async_command(payload, i)) + count += 1 + if count > 3: + break + + # Run all tasks concurrently + await asyncio.gather(*tasks) + + +# Run the async loop +if __name__ == "__main__": + start_time = time.time() + asyncio.run(send_multiple_queries()) + print(time.time() - start_time) diff --git a/local_working_directory/trying_LLM.py b/local_working_directory/trying_LLM.py new file mode 100644 index 0000000..5751074 --- /dev/null +++ b/local_working_directory/trying_LLM.py @@ -0,0 +1,37 @@ +import base64 +from openai import OpenAI +from owlapy.owl_individual import OWLNamedIndividual + + +def encode_image(image_path): + with open(image_path, "rb") as image_file: + return base64.b64encode(image_file.read()).decode('utf-8') + +# Path to your image +image = "000b3a87508b0fa185fbd53ecbe2e4c6.jpg" +# Getting the base64 string +base64_image = encode_image(image) + +client = OpenAI(base_url="http://tentris-ml.cs.upb.de:8501/v1", api_key="token-tentris-upb") +print(client.chat.completions.create( + model="tentris", + messages=[{ + "role": "user", + "content": [ + { + "type": "text", + "text": "You are a fashion expert." + "Your task is to give a short description of the apparel provided in the attached image." + "You should focus only on the apparel presented to you. Don't describe the background." + }, + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{base64_image}" + } + } + ] + }], + temperature=0.1, + seed=1 +).choices[0].message.content)