
Commit

added retrieval related scripts and other experiments
alkidbaci committed Oct 27, 2024
1 parent 7e0e60a commit 81de4cd
Showing 12 changed files with 534 additions and 0 deletions.
70 changes: 70 additions & 0 deletions general_working_directory/question_generation.py
@@ -0,0 +1,70 @@
import base64

import aiohttp
import asyncio
import time
from openai import OpenAI
from owlapy.iri import IRI
from owlapy.owl_ontology_manager import OntologyManager
from owlapy.owl_property import OWLDataProperty
from owlapy.owl_reasoner import OntologyReasoner, FastInstanceCheckerReasoner
from rdflib import Graph, URIRef, Literal, BNode, RDFS, OWL, Namespace, RDF
from rdflib.namespace import XSD
from owlapy.owl_individual import OWLNamedIndividual

def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

manager = OntologyManager()
ontology = manager.load_ontology(IRI.create("file://fashionpedia-third-generation.owl"))
base_reasoner = OntologyReasoner(ontology)
reasoner = FastInstanceCheckerReasoner(base_reasoner=base_reasoner, ontology=ontology)
dprop1 = OWLDataProperty(IRI.create("http://example.org/hasFileName"))
dprop2 = OWLDataProperty(IRI.create("http://example.org/hasDescription"))
dprop3 = OWLDataProperty(IRI.create("http://example.org/hasLLMDescription"))
image_iri_as_str = "http://example.org/image_25521"
image_ind = OWLNamedIndividual(image_iri_as_str)

image_filename = "images/" + str(list(reasoner.data_property_values(image_ind, dprop1))[0].get_literal())
llm_description = str(list(reasoner.data_property_values(image_ind, dprop3))[0].get_literal())
base64_image = encode_image(image_filename)
all_descriptions = ""

# "Consider you are a user that is looking for clothes and other apparels in an online recommandation system."
# "Formulate a query of a prompt-like structure that the user would use in such a way that the attached image would be returned. To generate the query you can take in consideration the following auxiliary information about the image:"
# f"{all_descriptions}"
# f"{llm_description}"

# "Only write the query which should be a question and always end with a questionmark."
for d in list(reasoner.data_property_values(image_ind,dprop2)):
all_descriptions = all_descriptions + d.get_literal() + "\n"

client = OpenAI(base_url="http://tentris-ml.cs.upb.de:8501/v1", api_key="token-tentris-upb")
print(client.chat.completions.create(
    model="tentris",
    messages=[
        {
            "role": "user",
            "content":
                [
                    {
                        "type": "text",
                        "text": "Consider you are a user that is looking for clothes/apparels in an online recommendation system. "
                                "Formulate a query of a prompt-like structure that you would ask in such a way that the attached image would be recommended to you. "
                                "To generate the query you can take into consideration the following auxiliary information about the image:\n"
                                f"{all_descriptions}"
                                f"{llm_description}\n"
                                "Only write the query, which should be a question and always end with a question mark."
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}"
                        }
                    }
                ]
        }
    ],
    temperature=0.1,
    seed=1
).choices[0].message.content)
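The aiohttp, asyncio and time imports are unused and the script only handles the hard-coded individual image_25521. A minimal sketch of running the same request over several images, assuming openai's AsyncOpenAI client against the same endpoint and a hypothetical batch of (prompt_text, base64_image) pairs prepared exactly as above:

import asyncio
from openai import AsyncOpenAI

async_client = AsyncOpenAI(base_url="http://tentris-ml.cs.upb.de:8501/v1", api_key="token-tentris-upb")

async def generate_question(prompt_text: str, b64_image: str) -> str:
    # Same multimodal message layout as the synchronous call above
    response = await async_client.chat.completions.create(
        model="tentris",
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": prompt_text},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64_image}"}},
            ],
        }],
        temperature=0.1,
        seed=1,
    )
    return response.choices[0].message.content

async def main(batch):
    # batch: iterable of (prompt_text, base64_image) pairs -- hypothetical, built as in the script
    return await asyncio.gather(*(generate_question(p, img) for p, img in batch))

# questions = asyncio.run(main(batch))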
66 changes: 66 additions & 0 deletions local_working_directory/bm25_fast.py
@@ -0,0 +1,66 @@
""" Implementation of OKapi BM25 with sklearn's TfidfVectorizer
Distributed as CC-0 (https://creativecommons.org/publicdomain/zero/1.0/)
"""

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
import rdflib

class BM25(object):
    def __init__(self, b=0.75, k1=1.6):
        self.vectorizer = TfidfVectorizer(norm=None, smooth_idf=False)
        self.b = b
        self.k1 = k1

    def fit(self, X):
        """ Fit IDF to documents X """
        self.vectorizer.fit(X)
        y = super(TfidfVectorizer, self.vectorizer).transform(X)
        self.avdl = y.sum(1).mean()

    def transform(self, q, X):
        """ Calculate BM25 between query q and documents X """
        b, k1, avdl = self.b, self.k1, self.avdl

        # apply CountVectorizer
        X = super(TfidfVectorizer, self.vectorizer).transform(X)
        len_X = X.sum(1).A1
        q, = super(TfidfVectorizer, self.vectorizer).transform([q])
        assert sparse.isspmatrix_csr(q)

        # convert to csc for better column slicing
        X = X.tocsc()[:, q.indices]
        denom = X + (k1 * (1 - b + b * len_X / avdl))[:, None]
        # sklearn computes idf(t) = log [ n / df(t) ] + 1, so subtract 1
        # to get the BM25 idf(t) = log [ n / df(t) ]
        idf = self.vectorizer._tfidf.idf_[None, q.indices] - 1.
        numer = X.multiply(np.broadcast_to(idf, X.shape)) * (k1 + 1)
        return (numer / denom).sum(1).A1



#------------ End of library impl. What follows is an example -----------------
from sklearn.datasets import fetch_20newsgroups
# documents = fetch_20newsgroups(subset='train').data
g = rdflib.Graph()
g.parse("fashionpedia-third-generation.owl", format="xml")

# Extract triplets
triplets = []
cn = 0
for subj, pred, obj in g:
    triplets.append((str(subj), str(pred), str(obj)))
    # cn += 1
    # if cn > 10000:
    #     break
print(len(triplets))
# Index the data (convert triplets to text format)
documents = ["\n".join(triplet) for triplet in triplets]

bm25 = BM25()
bm25.fit(documents)
# Find the similar documents given query
query = "What are some clothes containing blue tshirt with long sleeves?"
scores = bm25.transform(query, documents)
print(documents[np.argmax(scores)])
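For reference, transform() above computes the standard Okapi BM25 score of each document d against the query q; the -1 correction on sklearn's idf recovers the plain log(n/df) weight:

\mathrm{score}(q, d) = \sum_{t \in q} \log\frac{n}{\mathrm{df}(t)} \cdot \frac{f(t,d)\,(k_1 + 1)}{f(t,d) + k_1\left(1 - b + b\,\frac{|d|}{\mathrm{avgdl}}\right)}

The example prints only the single best match; np.argsort(scores)[::-1][:10], for instance, would give a ranked top-10 instead.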
40 changes: 40 additions & 0 deletions local_working_directory/bm25_retriever_example.py
@@ -0,0 +1,40 @@
import os

os.environ["OPENAI_API_KEY"] = "token-tentris-upb"

from llama_index.core import Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import SimpleDirectoryReader
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core.response.notebook_utils import display_source_node
import Stemmer

Settings.llm = OpenAI(model="gpt-3.5-turbo")
Settings.embed_model = OpenAIEmbedding(model_name="text-embedding-3-small")

# load documents
documents = SimpleDirectoryReader("./data/text-from-kg").load_data()

# initialize node parser
splitter = SentenceSplitter(chunk_size=512)
nodes = splitter.get_nodes_from_documents(documents)
# We can pass in the index, docstore, or list of nodes to create the retriever
bm25_retriever = BM25Retriever.from_defaults(
    nodes=nodes,
    similarity_top_k=2,
    # Optional: We can pass in the stemmer and set the language for stopwords
    # This is important for removing stopwords and stemming the query + text
    # The default is english for both
    stemmer=Stemmer.Stemmer("english"),
    language="english",
)

retrieved_nodes = bm25_retriever.retrieve(
    "What are some clothing options that include a black leather jacket with a zip-up front, above-the-hip length, asymmetrical silhouette, and biker style?"
)
for node in retrieved_nodes:
    print(node.text)
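Since retrieve() returns NodeWithScore objects, the BM25 score can be printed next to each hit; a minimal variant of the loop above, with an illustrative shorter query:

for hit in bm25_retriever.retrieve("black leather jacket with a zip-up front"):
    # hit.score is the BM25 score, hit.text the matched chunk
    print(hit.score, hit.text)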


29 changes: 29 additions & 0 deletions local_working_directory/bm25_retriever_kg_data.py
@@ -0,0 +1,29 @@
import os

os.environ["OPENAI_API_KEY"] = "token-tentris-upb"
import rdflib
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core import Settings, Document
from llama_index.core.storage.docstore.simple_docstore import SimpleDocumentStore
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.node_parser import SentenceSplitter
import Stemmer


Settings.llm = OpenAI(model="gpt-3.5-turbo")
Settings.embed_model = OpenAIEmbedding(model_name="text-embedding-3-small")

g = rdflib.Graph()
g.parse("fashionpedia-second-generation.owl", format="xml")

# Extract triplets
triplets = []
for subj, pred, obj in g:
    triplets.append((str(subj), str(pred), str(obj)))

# Index the data (convert triplets to text format)
text_data = ["\n".join(triplet) for triplet in triplets]

with open('fashionpedia-second-generation.txt', 'w') as file:
    # writelines() does not add separators, so terminate each triple explicitly
    file.writelines(t + "\n" for t in text_data)
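The script stops after exporting the triples, so the BM25Retriever, SentenceSplitter and Stemmer imports at the top are never used. A possible continuation under that assumption, reusing those imports and mirroring bm25_retriever_example.py (the query string is illustrative):

documents = [Document(text=t) for t in text_data]                  # one Document per triple
nodes = SentenceSplitter(chunk_size=512).get_nodes_from_documents(documents)
retriever = BM25Retriever.from_defaults(
    nodes=nodes,
    similarity_top_k=5,
    stemmer=Stemmer.Stemmer("english"),
    language="english",
)
for hit in retriever.retrieve("blue t-shirt with long sleeves"):
    print(hit.score, hit.text)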
86 changes: 86 additions & 0 deletions local_working_directory/recommendation_script1.py
@@ -0,0 +1,86 @@
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np
from owlapy.iri import IRI
from owlapy.owl_individual import OWLNamedIndividual
from owlapy.owl_ontology_manager import OntologyManager
from owlapy.owl_property import OWLDataProperty
from owlapy.owl_reasoner import OntologyReasoner, FastInstanceCheckerReasoner
from transformers import AutoModel
import json
import torch
import pandas as pd
import torch.nn.functional as F
import polars as pl

k = 3
# Set to None if you want to read all
nrows = None

with open('../image-filename-mappings.json', 'r') as file:
    filename_of = json.load(file)

manager = OntologyManager()
ontology = manager.load_ontology(IRI.create("file://fashionpedia-second-generation.owl"))
base_reasoner = OntologyReasoner(ontology)
reasoner = FastInstanceCheckerReasoner(base_reasoner=base_reasoner, ontology=ontology)
has_description = OWLDataProperty(IRI.create("http://example.org/hasDescription"))


# (1) Load the csv file fashionpedia-embeddings.csv".
print("Reading embeddings", end="\t")
df = pd.read_csv("../fashionpedia-embeddings.csv", index_col=0, nrows=nrows)
print(df.shape)
# (2) D a matrix each row represents an embedding vector
document_embeddings = F.normalize(torch.from_numpy(df.values).type(torch.float32), p=2, dim=1)
# (3)
document_ordered_names = df.index.values.tolist()
# (4) Initialize the embedder
print("Loading embedding model", end="\t")
model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-de', trust_remote_code=True,
                                  torch_dtype=torch.bfloat16)
# model = model
while True:
    query = input('What do you like to wear?\n')  # "I like a dress with wide neckline"
    print(f"QUERY:{query}")
    # query_embeddings: np.ndarray: torchFloatTensor: dim x 1
    query_embeddings = torch.from_numpy(model.encode(query))
    query_embeddings = F.normalize(query_embeddings.reshape(len(query_embeddings), -1), p=2, dim=0)

    similarities = (document_embeddings @ query_embeddings).flatten()

    top_scores, top_k_indices = torch.topk(similarities, k)
    top_k_indices = top_k_indices.cpu().numpy()
    # Plot k images given user's query.
    seen_set = set()
    pos_set = dict()
    neg_set = dict()
    for i in top_k_indices:
        # Text Preprocess
        try:
            image = document_ordered_names[i][:-2]
            filename = filename_of[image]
            if filename not in seen_set:
                seen_set.add(filename)
                img = np.asarray(Image.open(f"../images/{filename}"))
                plt.imshow(img)
                plt.show()

                ind = OWLNamedIndividual("http://example.org/" + image)
                all_desc = list(reasoner.data_property_values(ind, has_description))
                selected_desc = all_desc[int(document_ordered_names[i][-1]) - 1]
                print(selected_desc.get_literal())

                feedback = input('Does this image contain something that fits your preferences? (y/n)\n')
                if feedback == "y":
                    pos_set[image] = selected_desc.get_literal()
                elif feedback == "n":
                    neg_set[image] = selected_desc.get_literal()
                else:
                    print('Neutral selected')

        except KeyError:
            print(f"{i} not found")

    print(f"Positive examples: {pos_set}")
    print(f"Negative examples: {neg_set}")
21 changes: 21 additions & 0 deletions local_working_directory/single_question_fragmentation.py
@@ -0,0 +1,21 @@
from openai import OpenAI

client = OpenAI(base_url="http://tentris-ml.cs.upb.de:8501/v1", api_key="token-tentris-upb")
print(client.chat.completions.create(
    model="tentris",
    messages=[
        {
            "role": "user",
            "content":
                [
                    {
                        "type": "text",
                        "text": "Can you separate each part of the following question into self-contained questions: "
                                "What are some clothing options that include a black leather jacket, a light blue denim shirt with a white collar, and black pants, suitable for a casual yet edgy style?"
                    },
                ]
        }
    ],
    temperature=0.1,
    seed=1
).choices[0].message.content)
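If the sub-questions are needed programmatically rather than just printed, the reply has to be split apart again; a minimal post-processing sketch, assuming the model answers with one question per line (the prompt does not currently enforce this) and that the prompt text is kept in a hypothetical question_splitting_prompt variable:

response = client.chat.completions.create(
    model="tentris",
    messages=[{"role": "user", "content": [{"type": "text", "text": question_splitting_prompt}]}],
    temperature=0.1,
    seed=1,
)
# keep only lines that look like questions
sub_questions = [line.strip() for line in response.choices[0].message.content.splitlines()
                 if line.strip().endswith("?")]
print(sub_questions)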
70 changes: 70 additions & 0 deletions local_working_directory/single_question_generation.py
@@ -0,0 +1,70 @@
import base64

import aiohttp
import asyncio
import time
from openai import OpenAI
from owlapy.iri import IRI
from owlapy.owl_ontology_manager import OntologyManager
from owlapy.owl_property import OWLDataProperty
from owlapy.owl_reasoner import OntologyReasoner, FastInstanceCheckerReasoner
from rdflib import Graph, URIRef, Literal, BNode, RDFS, OWL, Namespace, RDF
from rdflib.namespace import XSD
from owlapy.owl_individual import OWLNamedIndividual

def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

manager = OntologyManager()
ontology = manager.load_ontology(IRI.create("file://fashionpedia-third-generation.owl"))
base_reasoner = OntologyReasoner(ontology)
reasoner = FastInstanceCheckerReasoner(base_reasoner=base_reasoner, ontology=ontology)
dprop1 = OWLDataProperty(IRI.create("http://example.org/hasFileName"))
dprop2 = OWLDataProperty(IRI.create("http://example.org/hasDescription"))
dprop3 = OWLDataProperty(IRI.create("http://example.org/hasLLMDescription"))
image_iri_as_str = "http://example.org/image_25521"
image_ind = OWLNamedIndividual(image_iri_as_str)

image_filename = "images/" + str(list(reasoner.data_property_values(image_ind, dprop1))[0].get_literal())
llm_description = str(list(reasoner.data_property_values(image_ind, dprop3))[0].get_literal())
base64_image = encode_image(image_filename)
all_descriptions = ""

# "Consider you are a user that is looking for clothes and other apparels in an online recommandation system."
# "Formulate a query of a prompt-like structure that the user would use in such a way that the attached image would be returned. To generate the query you can take in consideration the following auxiliary information about the image:"
# f"{all_descriptions}"
# f"{llm_description}"

# "Only write the query which should be a question and always end with a questionmark."
for d in list(reasoner.data_property_values(image_ind,dprop2)):
all_descriptions = all_descriptions + d.get_literal() + "\n"

client = OpenAI(base_url="http://tentris-ml.cs.upb.de:8501/v1", api_key="token-tentris-upb")
print(client.chat.completions.create(
    model="tentris",
    messages=[
        {
            "role": "user",
            "content":
                [
                    {
                        "type": "text",
                        "text": "Consider you are a user that is looking for clothes/apparels in an online recommendation system. "
                                "Formulate a query of a prompt-like structure that you would ask in such a way that the attached image would be recommended to you. "
                                "To generate the query you can take into consideration the following auxiliary information about the image:\n"
                                f"{all_descriptions}"
                                f"{llm_description}\n"
                                "Only write the query, which should be a question and always end with a question mark."
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}"
                        }
                    }
                ]
        }
    ],
    temperature=0.1,
    seed=1
).choices[0].message.content)

0 comments on commit 81de4cd
