-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
added retrieval related scripts and other experiments
- Loading branch information
Showing
12 changed files
with
534 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
import base64 | ||
|
||
import aiohttp | ||
import asyncio | ||
import time | ||
from openai import OpenAI | ||
from owlapy.iri import IRI | ||
from owlapy.owl_ontology_manager import OntologyManager | ||
from owlapy.owl_property import OWLDataProperty | ||
from owlapy.owl_reasoner import OntologyReasoner, FastInstanceCheckerReasoner | ||
from rdflib import Graph, URIRef, Literal, BNode, RDFS, OWL, Namespace, RDF | ||
from rdflib.namespace import XSD | ||
from owlapy.owl_individual import OWLNamedIndividual | ||
|
||
def encode_image(image_path):
    """Return the bytes of the file at ``image_path`` as a base64 text string."""
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')
|
||
# Load the ontology and wrap it in a reasoner so that data-property values of
# individuals can be looked up.
manager = OntologyManager()
ontology = manager.load_ontology(IRI.create("file://fashionpedia-third-generation.owl"))
base_reasoner = OntologyReasoner(ontology)
reasoner = FastInstanceCheckerReasoner(base_reasoner=base_reasoner, ontology=ontology)

# Data properties read for the image individual below.
dprop1 = OWLDataProperty(IRI.create("http://example.org/hasFileName"))
dprop2 = OWLDataProperty(IRI.create("http://example.org/hasDescription"))
dprop3 = OWLDataProperty(IRI.create("http://example.org/hasLLMDescription"))
image_iri_as_str = "http://example.org/image_25521"
image_ind = OWLNamedIndividual(image_iri_as_str)

# Only the first hasFileName / hasLLMDescription value is used; an individual
# that has none raises IndexError on the [0] lookup.
image_filename = "images/" + str(list(reasoner.data_property_values(image_ind, dprop1))[0].get_literal())
llm_description = str(list(reasoner.data_property_values(image_ind, dprop3))[0].get_literal())
base64_image = encode_image(image_filename)

# Every hasDescription value on its own line (each one newline-terminated).
all_descriptions = "".join(
    d.get_literal() + "\n" for d in reasoner.data_property_values(image_ind, dprop2)
)

# Adjacent string literals are concatenated verbatim, so the separators between
# the sentences and the inserted descriptions are spelled out explicitly;
# without them the LLM description ran straight into the closing instruction.
prompt_text = (
    "Consider you are a user that is looking for clothes/apparels in an online recommandation system. "
    "Formulate a query of a prompt-like structure that the you would ask in such a way that the attached image would be recommended to you. To generate the query you can take in consideration the following auxiliary information about the image:\n"
    f"{all_descriptions}"
    f"{llm_description}\n"
    "Only write the query which should be a question and always end with a questionmark."
)

# Send the prompt together with the base64-encoded image and print the reply.
client = OpenAI(base_url="http://tentris-ml.cs.upb.de:8501/v1", api_key="token-tentris-upb")
response = client.chat.completions.create(
    model="tentris",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": prompt_text,
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{base64_image}"
                    },
                },
            ],
        }
    ],
    temperature=0.1,
    seed=1,
)
print(response.choices[0].message.content)
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
""" Implementation of OKapi BM25 with sklearn's TfidfVectorizer | ||
Distributed as CC-0 (https://creativecommons.org/publicdomain/zero/1.0/) | ||
""" | ||
|
||
import numpy as np | ||
from sklearn.feature_extraction.text import TfidfVectorizer | ||
from scipy import sparse | ||
import rdflib | ||
|
||
class BM25(object):
    """Okapi BM25 scorer built on top of sklearn's ``TfidfVectorizer``.

    The vectorizer supplies the vocabulary, raw term counts and IDF weights;
    ``b`` and ``k1`` are the constants used in the document-length
    normalisation and term-frequency scaling of the score.
    """

    def __init__(self, b=0.75, k1=1.6):
        self.vectorizer = TfidfVectorizer(norm=None, smooth_idf=False)
        self.b = b
        self.k1 = k1

    def fit(self, X):
        """Fit the IDF weights on documents ``X`` and store the mean document length."""
        self.vectorizer.fit(X)
        # Calling the parent class' transform (CountVectorizer) gives raw term
        # counts instead of tf-idf weights.
        term_counts = super(TfidfVectorizer, self.vectorizer).transform(X)
        self.avdl = term_counts.sum(1).mean()

    def transform(self, q, X):
        """Return the BM25 score of query ``q`` against every document in ``X``."""
        b = self.b
        k1 = self.k1
        avdl = self.avdl

        # Raw term counts for the documents and for the single query.
        doc_counts = super(TfidfVectorizer, self.vectorizer).transform(X)
        doc_lengths = doc_counts.sum(1).A1
        query_row, = super(TfidfVectorizer, self.vectorizer).transform([q])
        assert sparse.isspmatrix_csr(query_row)
        term_ids = query_row.indices

        # CSC layout makes selecting the query's term columns cheaper.
        doc_counts = doc_counts.tocsc()[:, term_ids]
        length_norm = k1 * (1 - b + b * doc_lengths / avdl)
        denom = doc_counts + length_norm[:, None]
        # sklearn computes idf(t) = log[n / df(t)] + 1, so subtract 1 to get
        # the plain idf(t) = log[n / df(t)].
        idf = self.vectorizer._tfidf.idf_[None, term_ids] - 1.
        numer = doc_counts.multiply(np.broadcast_to(idf, doc_counts.shape)) * (k1 + 1)
        return (numer / denom).sum(1).A1
|
||
|
||
|
||
#------------ End of library impl. What follows is the example -----------------
from sklearn.datasets import fetch_20newsgroups
# documents = fetch_20newsgroups(subset='train').data
g = rdflib.Graph()
g.parse("fashionpedia-third-generation.owl", format="xml")

# Counter kept for an optional cap on the number of triples read (currently unused).
cn = 0
# Collect every (subject, predicate, object) triple of the graph as plain strings.
triplets = [(str(subj), str(pred), str(obj)) for subj, pred, obj in g]
print(len(triplets))

# One text document per triple, with its three parts on separate lines.
documents = ["\n".join(parts) for parts in triplets]

bm25 = BM25()
bm25.fit(documents)

# Score every document against the query and print the best-scoring one.
query = "What are some clothes containing blue tshirt with long sleeves?"
scores = bm25.transform(query, documents)
best_index = np.argmax(scores)
print(documents[best_index])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
import os | ||
|
||
os.environ["OPENAI_API_KEY"] = "token-tentris-upb" | ||
|
||
from llama_index.core import Settings | ||
from llama_index.llms.openai import OpenAI | ||
from llama_index.embeddings.openai import OpenAIEmbedding | ||
from llama_index.core.node_parser import SentenceSplitter | ||
from llama_index.core import SimpleDirectoryReader | ||
from llama_index.retrievers.bm25 import BM25Retriever | ||
from llama_index.core.response.notebook_utils import display_source_node | ||
import Stemmer | ||
|
||
# Register the LLM and the embedding model on llama_index's global Settings.
Settings.llm = OpenAI(model="gpt-3.5-turbo")
Settings.embed_model = OpenAIEmbedding(model_name="text-embedding-3-small")

# Read the documents from disk and split them into nodes.
documents = SimpleDirectoryReader("./data/text-from-kg").load_data()
splitter = SentenceSplitter(chunk_size=512)
nodes = splitter.get_nodes_from_documents(documents)

# The retriever could also be built from an index or a docstore; here the list
# of nodes is passed directly. The stemmer and stopword language are optional
# and both default to English.
bm25_retriever = BM25Retriever.from_defaults(
    nodes=nodes,
    similarity_top_k=2,
    stemmer=Stemmer.Stemmer("english"),
    language="english",
)

query_text = "What are some clothing options that include a black leather jacket with a zip-up front, above-the-hip length, asymmetrical silhouette, and biker style?"
retrieved_nodes = bm25_retriever.retrieve(query_text)

# Print the text of every retrieved node.
for node in retrieved_nodes:
    print(node.text)
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
import os | ||
|
||
os.environ["OPENAI_API_KEY"] = "token-tentris-upb" | ||
import rdflib | ||
from llama_index.retrievers.bm25 import BM25Retriever | ||
from llama_index.core import Settings, Document | ||
from llama_index.core.storage.docstore.simple_docstore import SimpleDocumentStore | ||
from llama_index.llms.openai import OpenAI | ||
from llama_index.embeddings.openai import OpenAIEmbedding | ||
from llama_index.core.node_parser import SentenceSplitter | ||
import Stemmer | ||
|
||
|
||
# Register the LLM and the embedding model on llama_index's global Settings.
Settings.llm = OpenAI(model="gpt-3.5-turbo")
Settings.embed_model = OpenAIEmbedding(model_name="text-embedding-3-small")

g = rdflib.Graph()
g.parse("fashionpedia-second-generation.owl", format="xml")

# Collect every (subject, predicate, object) triple of the graph as plain strings.
triplets = [(str(subj), str(pred), str(obj)) for subj, pred, obj in g]

# One text entry per triple, with its three parts on separate lines.
text_data = ["\n".join(triplet) for triplet in triplets]

# writelines() does not insert separators, so each entry is terminated with a
# newline; otherwise the object of one triple is fused with the subject of the
# next. The encoding is fixed so literals outside the platform default charset
# can be written.
with open('fashionpedia-second-generation.txt', 'w', encoding='utf-8') as file:
    file.writelines(entry + "\n" for entry in text_data)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
from PIL import Image | ||
import matplotlib.pyplot as plt | ||
import numpy as np | ||
from owlapy.iri import IRI | ||
from owlapy.owl_individual import OWLNamedIndividual | ||
from owlapy.owl_ontology_manager import OntologyManager | ||
from owlapy.owl_property import OWLDataProperty | ||
from owlapy.owl_reasoner import OntologyReasoner, FastInstanceCheckerReasoner | ||
from transformers import AutoModel | ||
import json | ||
import torch | ||
import pandas as pd | ||
import torch.nn.functional as F | ||
import polars as pl | ||
|
||
# Number of top-scoring entries to retrieve per query.
k = 3
# Row limit handed to pandas.read_csv; None reads the whole file.
nrows = None

# Mapping that is later indexed by image name to obtain an image file name.
with open('../image-filename-mappings.json', 'r') as mapping_file:
    filename_of = json.load(mapping_file)

# Ontology and reasoner used to look up hasDescription values.
manager = OntologyManager()
ontology_iri = IRI.create("file://fashionpedia-second-generation.owl")
ontology = manager.load_ontology(ontology_iri)
base_reasoner = OntologyReasoner(ontology)
reasoner = FastInstanceCheckerReasoner(base_reasoner=base_reasoner, ontology=ontology)
has_description = OWLDataProperty(IRI.create("http://example.org/hasDescription"))

# (1) Load the csv file "fashionpedia-embeddings.csv"; its first column is the index.
print("Reading embeddings", end="\t")
df = pd.read_csv("../fashionpedia-embeddings.csv", index_col=0, nrows=nrows)
print(df.shape)
# (2) Matrix in which each row is one embedding vector, L2-normalised per row.
embedding_matrix = torch.from_numpy(df.values).type(torch.float32)
document_embeddings = F.normalize(embedding_matrix, p=2, dim=1)
# (3) Index labels in the same order as the rows of the matrix.
document_ordered_names = df.index.values.tolist()
# (4) Initialize the embedder.
# NOTE(review): trust_remote_code=True presumably lets the model repository
# supply its own code - confirm this is intended.
print("Loading embedding model", end="\t")
model = AutoModel.from_pretrained(
    'jinaai/jina-embeddings-v2-base-de',
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)
# Interactive loop: read a query, embed it, score it against the stored
# embeddings, show the top-k hits and collect y/n feedback on them.
# No exit condition is visible in this loop.
# NOTE(review): indentation was lost in this copy, so it cannot be determined
# here which statements belong to `if filename not in seen_set:` or whether the
# two final prints are inside the loop - confirm against the original file.
while True: | ||
query = input('What do you like to wear?\n') # "I like a dress with wide neckline" | ||
print(f"QUERY:{query}") | ||
# query_embeddings: np.ndarray: torchFloatTensor: dim x 1 | ||
# model.encode(query) is passed to torch.from_numpy, so it is expected to
# return a NumPy array - TODO confirm.
query_embeddings = torch.from_numpy(model.encode(query)) | ||
# Reshape to (len, -1) and L2-normalise along dim 0; assumes a 1-D query
# vector, which would make this a single column - TODO confirm.
query_embeddings = F.normalize(query_embeddings.reshape(len(query_embeddings), -1), p=2, dim=0) | ||
|
||
# Matrix product of the document embeddings with the query, flattened to one
# score per document row.
similarities = (document_embeddings @ query_embeddings).flatten() | ||
|
||
# Scores and row indices of the k highest-scoring documents.
top_scores, top_k_indices = torch.topk(similarities, k) | ||
top_k_indices = top_k_indices.cpu().numpy() | ||
# Plot k images given user's query. | ||
# seen_set holds file names already handled for this query; pos_set / neg_set
# map an image name to the description the user accepted / rejected.
seen_set = set() | ||
pos_set = dict() | ||
neg_set = dict() | ||
for i in top_k_indices: | ||
# Text Preprocess | ||
try: | ||
# The stored name minus its last two characters is used as the image name;
# presumably the name ends in a separator plus one digit - TODO confirm.
image = document_ordered_names[i][:-2] | ||
filename = filename_of[image] | ||
if filename not in seen_set: | ||
seen_set.add(filename) | ||
img = np.asarray(Image.open(f"../images/{filename}")) | ||
plt.imshow(img) | ||
plt.show() | ||
|
||
# Look up all hasDescription values of the image individual.
ind = OWLNamedIndividual("http://example.org/" + image) | ||
all_desc = list(reasoner.data_property_values(ind, has_description)) | ||
# The last character of the stored name is read as a 1-based position in
# all_desc; this only works for single-digit suffixes.
selected_desc = all_desc[int(document_ordered_names[i][-1]) - 1] | ||
print(selected_desc.get_literal()) | ||
|
||
# "y" records a positive example, "n" a negative one; anything else is ignored.
feedback = input('Does this image contain something that fit your preferences? (y/n)\n') | ||
if feedback == "y": | ||
pos_set[image] = selected_desc.get_literal() | ||
elif feedback == "n": | ||
neg_set[image] = selected_desc.get_literal() | ||
else: | ||
print('Neutral selected') | ||
|
||
# A KeyError raised in the try body (e.g. an image name missing from
# filename_of) is reported with the row index i, not the image name.
except KeyError: | ||
print(f"{i} not found") | ||
|
||
print(f"Positive examples: {pos_set}") | ||
print(f"Negative examples: {neg_set}") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
from openai import OpenAI | ||
|
||
# Adjacent string literals are concatenated verbatim, so the instruction and the
# question are separated explicitly; without the newline the text read
# "...self-contained questions:What are some...".
prompt_text = (
    "Can you separate each part of the following question into self-contained questions:\n"
    "What are some clothing options that include a black leather jacket, a light blue denim shirt with a white collar, and black pants, suitable for a casual yet edgy style?"
)

# Send the prompt to the chat endpoint and print the reply text.
client = OpenAI(base_url="http://tentris-ml.cs.upb.de:8501/v1", api_key="token-tentris-upb")
response = client.chat.completions.create(
    model="tentris",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": prompt_text,
                },
            ],
        }
    ],
    temperature=0.1,
    seed=1,
)
print(response.choices[0].message.content)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
import base64 | ||
|
||
import aiohttp | ||
import asyncio | ||
import time | ||
from openai import OpenAI | ||
from owlapy.iri import IRI | ||
from owlapy.owl_ontology_manager import OntologyManager | ||
from owlapy.owl_property import OWLDataProperty | ||
from owlapy.owl_reasoner import OntologyReasoner, FastInstanceCheckerReasoner | ||
from rdflib import Graph, URIRef, Literal, BNode, RDFS, OWL, Namespace, RDF | ||
from rdflib.namespace import XSD | ||
from owlapy.owl_individual import OWLNamedIndividual | ||
|
||
def encode_image(image_path):
    """Return the bytes of the file at ``image_path`` as a base64 text string."""
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')
|
||
# Load the ontology and wrap it in a reasoner so that data-property values of
# individuals can be looked up.
manager = OntologyManager()
ontology = manager.load_ontology(IRI.create("file://fashionpedia-third-generation.owl"))
base_reasoner = OntologyReasoner(ontology)
reasoner = FastInstanceCheckerReasoner(base_reasoner=base_reasoner, ontology=ontology)

# Data properties read for the image individual below.
dprop1 = OWLDataProperty(IRI.create("http://example.org/hasFileName"))
dprop2 = OWLDataProperty(IRI.create("http://example.org/hasDescription"))
dprop3 = OWLDataProperty(IRI.create("http://example.org/hasLLMDescription"))
image_iri_as_str = "http://example.org/image_25521"
image_ind = OWLNamedIndividual(image_iri_as_str)

# Only the first hasFileName / hasLLMDescription value is used; an individual
# that has none raises IndexError on the [0] lookup.
image_filename = "images/" + str(list(reasoner.data_property_values(image_ind, dprop1))[0].get_literal())
llm_description = str(list(reasoner.data_property_values(image_ind, dprop3))[0].get_literal())
base64_image = encode_image(image_filename)

# Every hasDescription value on its own line (each one newline-terminated).
all_descriptions = "".join(
    d.get_literal() + "\n" for d in reasoner.data_property_values(image_ind, dprop2)
)

# Adjacent string literals are concatenated verbatim, so the separators between
# the sentences and the inserted descriptions are spelled out explicitly;
# without them the LLM description ran straight into the closing instruction.
prompt_text = (
    "Consider you are a user that is looking for clothes/apparels in an online recommandation system. "
    "Formulate a query of a prompt-like structure that the you would ask in such a way that the attached image would be recommended to you. To generate the query you can take in consideration the following auxiliary information about the image:\n"
    f"{all_descriptions}"
    f"{llm_description}\n"
    "Only write the query which should be a question and always end with a questionmark."
)

# Send the prompt together with the base64-encoded image and print the reply.
client = OpenAI(base_url="http://tentris-ml.cs.upb.de:8501/v1", api_key="token-tentris-upb")
response = client.chat.completions.create(
    model="tentris",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": prompt_text,
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{base64_image}"
                    },
                },
            ],
        }
    ],
    temperature=0.1,
    seed=1,
)
print(response.choices[0].message.content)
Oops, something went wrong.