Skip to content

Commit

Permalink
restructuring of files
Browse files Browse the repository at this point in the history
  • Loading branch information
alkidbaci committed Nov 14, 2024
1 parent 0c361ef commit f25d35f
Show file tree
Hide file tree
Showing 14 changed files with 276 additions and 43 deletions.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
72 changes: 72 additions & 0 deletions archives/recommendation_script_old.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import numpy
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np
import torch
from transformers import AutoModel
from numpy.linalg import norm
import json


def sort(d):
    """Return a copy of ``d`` ordered by value, largest first.

    Args:
        d: Mapping whose values can be compared with each other.

    Returns:
        A new ``dict`` whose insertion order follows descending value.
        Entries with equal values keep their original relative order,
        because ``sorted`` is stable even with ``reverse=True``.
    """
    return dict(sorted(d.items(), key=lambda item: item[1], reverse=True))

intent = input('What do you like to wear?\n')  # "I like a dress with wide neckline"


def cos_sim(a, b):
    """Return the cosine similarity of the embedding vectors ``a`` and ``b``."""
    return (a @ b.T) / (norm(a) * norm(b))


def _image_id(embedding_key):
    """Strip the trailing ``_<n>`` description counter from an embedding key.

    The original code used ``key[:-2]``, which only works while the counter is
    a single digit. Splitting on the last underscore gives the same result for
    single-digit counters and stays correct for longer ones.
    NOTE(review): assumes every key ends in ``_<n>`` - confirm against the
    script that produced fashionpedia-embeddings.json.
    """
    return embedding_key.rsplit("_", 1)[0]


model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-de', trust_remote_code=True,
                                  torch_dtype=torch.bfloat16)

# Stored embeddings, keyed by "<image id>_<description counter>".
with open("fashionpedia-embeddings.json", 'r') as f:
    data = json.load(f)

# Maps an image id to the file name of the picture on disk.
with open('../image-filename-mappings.json', 'r') as file:
    filename_of = json.load(file)

# The query text does not change while scanning the stored embeddings, so it
# is encoded once instead of once per stored embedding.
query_embedding = model.encode(intent)

image_cos = {}
top_cos_sin_value = 0
top_image = ""
for image, stored_embedding in data.items():
    v = cos_sim(query_embedding, numpy.array(stored_embedding))
    image_cos[image] = v
    if v > top_cos_sin_value:
        top_cos_sin_value = v
        top_image = image

ranked_images = sort(image_cos)
print(ranked_images)
# print(f"Top Image: {top_image} with Cosine Similarity: {top_cos_sin_value}")
# top_image_filename = filename_of[top_image[:-2]]
#
# img = np.asarray(Image.open(f"images/{top_image_filename}"))
# plt.imshow(img)
# plt.show()

# Show the best-ranked pictures (each file at most once) and collect feedback
# until four distinct pictures have been shown.
seen_set = set()
pos_set = set()
neg_set = set()
for image in ranked_images:
    image_id = _image_id(image)
    filename = filename_of[image_id]
    if filename in seen_set:
        continue
    seen_set.add(filename)
    # Close the file handle as soon as the pixels are copied into the array.
    with Image.open(f"images/{filename}") as picture:
        img = np.asarray(picture)
    plt.imshow(img)
    plt.show()
    feedback = input('Does this image contain something that fit your preferences? (y/n)\n')
    if feedback == "y":
        pos_set.add(image_id)
    elif feedback == "n":
        neg_set.add(image_id)
    else:
        print('Neutral selected')
    if len(seen_set) == 4:
        break

# json cannot serialise ``set`` objects (TypeError), so the examples are
# written as sorted lists, which also makes the file deterministic.
with open("lp.json", 'w') as f:
    json.dump({"pos": sorted(pos_set), "neg": sorted(neg_set)}, f)

# print(f"Positive examples: {pos_set}")
# print(f"Negative examples: {neg_set}")
42 changes: 21 additions & 21 deletions general_working_directory/embeddings-generation.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,30 @@
import csv
from openai import OpenAI
from owlapy.iri import IRI
from owlapy.owl_ontology_manager import OntologyManager
from owlapy.owl_property import OWLDataProperty
from owlapy.owl_reasoner import OntologyReasoner, FastInstanceCheckerReasoner
import torch
from transformers import AutoModel
from numpy.linalg import norm
import json

# NOTE(review): this section is shown as a diff whose +/- markers were lost, so
# statements from before and after the commit appear interleaved below (two
# consecutive load_ontology calls, two definitions of the hasDescription
# property, two embedding pipelines). Confirm against the repository which
# lines are live before relying on any of them. Indentation was also lost.
manager = OntologyManager()
# Loads the second-generation ontology; the name is rebound by the next statement.
ontology = manager.load_ontology(IRI.create("file://../fashionpedia-second-generation.owl"))
# Loads the third-generation ontology; this is the object the reasoners below receive.
ontology = manager.load_ontology(IRI.create("file://fashionpedia-third-generation.owl"))
base_reasoner = OntologyReasoner(ontology)
reasoner = FastInstanceCheckerReasoner(base_reasoner=base_reasoner, ontology=ontology)
# has_description and dprop2 are built from the same IRI (hasDescription).
has_description = OWLDataProperty(IRI.create("http://example.org/hasDescription"))
dprop2 = OWLDataProperty(IRI.create("http://example.org/hasDescription"))
dprop3 = OWLDataProperty(IRI.create("http://example.org/hasLLMDescription"))

# Cosine similarity helper; it is not called anywhere in the lines visible here.
cos_sim = lambda a, b: (a @ b.T) / (norm(a) * norm(b))
model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-de', trust_remote_code=True,
torch_dtype=torch.bfloat16)

# Pipeline 1: encode every hasDescription literal of every individual with the
# local model and store it under "<last IRI path segment>_<counter>", where the
# counter starts at 1 for each individual.
embeddings_final = {}
for image in ontology.individuals_in_signature():
descriptions = list(reasoner.data_property_values(image, has_description))
desc_counter = 1
for description in descriptions:
embeddings = model.encode(description.get_literal())
embeddings_final[image.str.split("/")[-1] + f"_{desc_counter}"] = embeddings.tolist()
desc_counter += 1

# Dumps the whole key -> embedding-list mapping as JSON.
with open("../fashionpedia-embeddingsss", 'w') as f:
json.dump(embeddings_final, f)
# Pipeline 2: request one embedding per individual from the remote endpoint and
# append it to output.csv (mode='a', so rows accumulate across runs).
with open('output.csv', mode='a', newline='') as file:
writer = csv.writer(file)
# NOTE(review): endpoint URL and API token are hard-coded here.
client = OpenAI(base_url="http://tentris-ml.cs.upb.de:8502/v1", api_key="token-tentris-upb")
count = 0
for image_ind in ontology.individuals_in_signature():
# Takes the first hasLLMDescription value; [0] raises IndexError when there is none.
llm_description = str(list(reasoner.data_property_values(image_ind, dprop3))[0].get_literal())
# Cap the LLM description at 4000 characters before it is sent.
if len(llm_description) > 4000:
llm_description = llm_description[:4000]
# Concatenate every hasDescription literal, one per line.
all_descriptions = ""
for d in list(reasoner.data_property_values(image_ind, dprop2)):
all_descriptions = all_descriptions + d.get_literal() + "\n"
image_iri = image_ind.str
# A single input string: all descriptions followed by the LLM description.
responses = client.embeddings.create(input=[all_descriptions + "\n " + llm_description], model="tentris")
# Row layout: IRI, then the embedding list in a single CSV cell.
writer.writerow([image_iri, responses.data[0].embedding])
count += 1
# Progress output; the total 45,623 is a hard-coded literal.
print(f"{image_iri}: {count:,}/45,623")
73 changes: 73 additions & 0 deletions local_working_directory/bm25_fast_score_placement.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
import rdflib


class BM25(object):
    """BM25 ranking built on top of scikit-learn's ``TfidfVectorizer``.

    The vectorizer supplies the vocabulary and IDF weights; raw term counts
    are obtained by calling the parent-class ``transform`` through ``super``.

    Args:
        b: Length-normalisation strength used in the BM25 denominator.
        k1: Term-frequency saturation parameter.
    """

    def __init__(self, b=0.75, k1=1.6):
        self.vectorizer = TfidfVectorizer(norm=None, smooth_idf=False)
        self.b = b
        self.k1 = k1

    def fit(self, X):
        """Fit IDF to documents X and record the average document length.

        Args:
            X: Iterable of document strings.

        Returns:
            ``self``, so calls can be chained (the original returned ``None``).
        """
        self.vectorizer.fit(X)
        # super(TfidfVectorizer, ...) bypasses the TF-IDF weighting and yields
        # plain term counts, so the row sums are document lengths.
        y = super(TfidfVectorizer, self.vectorizer).transform(X)
        self.avdl = y.sum(1).mean()
        return self

    def transform(self, q, X):
        """Calculate BM25 between query q and documents X.

        Args:
            q: Query string.
            X: Iterable of document strings to score against ``q``.

        Returns:
            A flat array with one score per document in ``X``.

        Raises:
            AttributeError: If called before ``fit`` (``avdl`` is not set yet).
        """
        b, k1, avdl = self.b, self.k1, self.avdl

        # apply CountVectorizer
        X = super(TfidfVectorizer, self.vectorizer).transform(X)
        len_X = X.sum(1).A1
        q, = super(TfidfVectorizer, self.vectorizer).transform([q])
        assert sparse.isspmatrix_csr(q)

        # convert to csc for better column slicing
        X = X.tocsc()[:, q.indices]
        denom = X + (k1 * (1 - b + b * len_X / avdl))[:, None]
        # idf(t) = log [ n / df(t) ] + 1 in sklearn, so it needs to be converted
        # to idf(t) = log [ n / df(t) ] by subtracting 1
        idf = self.vectorizer._tfidf.idf_[None, q.indices] - 1.
        numer = X.multiply(np.broadcast_to(idf, X.shape)) * (k1 + 1)
        return (numer / denom).sum(1).A1


# ------------ End of library implementation. The following is an example. -----------------
from sklearn.datasets import fetch_20newsgroups

# documents = fetch_20newsgroups(subset='train').data
g = rdflib.Graph()
g.parse("fashionpedia-third-generation.owl", format="xml")

# Build one text document per distinct subject: the subject IRI followed by
# the string form of every object attached to it.
# Graph.subjects() yields a subject once per matching triple, so the subjects
# are de-duplicated (keeping first-seen order). Otherwise every document is
# repeated once per triple, which skews the corpus statistics BM25 is fitted on.
iris = []
documents = []
for subject in dict.fromkeys(g.subjects()):
    txt = f"{str(subject)} \n" + "".join(
        f"{obj} \n" for _predicate, obj in g.predicate_objects(subject)
    )
    iris.append(str(subject))
    documents.append(txt)

bm25 = BM25()
bm25.fit(documents)
# Find the similar documents given query
query = "What elegant and traditional bridal gowns are available?"
target_iri = "http://example.org/image_47178"
scores = bm25.transform(query, documents)
# The IRIs are kept in a parallel list instead of being re-parsed from the
# first whitespace-separated token of each document.
storage = dict(zip(iris, scores))

storage_sorted = dict(sorted(storage.items(), key=lambda item: item[1], reverse=True))
ranking = list(storage_sorted)

# placement is the 0-based rank of target_iri (0 = best score).
placement = None
if target_iri in storage_sorted:
    placement = ranking.index(target_iri)
    print(f"target_iri placement: {placement} \n target_iri score: {storage_sorted[target_iri]}")
else:
    # The original raised ValueError from list.index() in this case.
    print(f"target_iri {target_iri} is not a subject of the parsed graph")

# Print only the best-scoring entry; the original printed the whole ranking
# under this label.
if ranking:
    print(f" Top scorer: {(ranking[0], storage_sorted[ranking[0]])}")

# print(documents[np.argmax(scores)])
25 changes: 25 additions & 0 deletions local_working_directory/embedding-retriever.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from openai import OpenAI
import pandas as pd
import numpy as np
from numpy.linalg import norm

query = 'I like a dress with wide neckline'

# The first CSV column becomes the index and is read back as the IRI list;
# every remaining column is treated as one embedding component.
# NOTE(review): read_csv treats the first row as a header by default - confirm
# that embeddings_short2.csv actually has one.
df = pd.read_csv("embeddings_short2.csv", index_col=0)
iris = df.index.values.tolist()

# NOTE(review): endpoint URL and API token are hard-coded here.
client = OpenAI(base_url="http://tentris-ml.cs.upb.de:8502/v1", api_key="token-tentris-upb")

docs = df.to_numpy(dtype=float)
qr = np.asarray(client.embeddings.create(input=[query], model="tentris").data[0].embedding, dtype=float)

# Normalise rows to unit length. An all-zero vector has norm 0, and dividing
# by it would give NaN, which np.argmax then reports as the "best" match.
# Such vectors are left as zeros, so their similarity is 0.
doc_lengths = norm(docs, axis=1, keepdims=True)
doc_lengths[doc_lengths == 0] = 1.0
docs_norms = docs / doc_lengths
qr_length = norm(qr)
qr_norms = qr / qr_length if qr_length else qr

# (n_docs, dim) @ (dim,) already gives a flat vector of cosine similarities.
cosine_similarities = docs_norms @ qr_norms

best_match_index = np.argmax(cosine_similarities)
best_similarity = cosine_similarities[best_match_index]

print(cosine_similarities)
print(f"The best scoring image is the image with iri: {iris[best_match_index]} and score: {best_similarity}")
3 changes: 3 additions & 0 deletions local_working_directory/embeddings_short2.csv

Large diffs are not rendered by default.

20 changes: 0 additions & 20 deletions local_working_directory/fashionpedia-embeddings.csv

This file was deleted.

1 change: 0 additions & 1 deletion local_working_directory/fashionpedia-embeddings.json

This file was deleted.

1 change: 0 additions & 1 deletion local_working_directory/image_18321_embeddings

This file was deleted.

82 changes: 82 additions & 0 deletions local_working_directory/recommendation_script2_first_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
import json
from PIL import Image

import matplotlib.pyplot as plt
import numpy as np
from owlapy.class_expression import OWLObjectHasValue
from owlapy.iri import IRI
from owlapy.owl_individual import OWLNamedIndividual
from ontolearn.knowledge_base import KnowledgeBase
from ontolearn.concept_learner import EvoLearner, CELOE
from ontolearn.learning_problem import PosNegLPStandard
from owlapy.owl_property import OWLObjectProperty, OWLDataProperty

# Knowledge base over the first-generation ontology file; the path is relative
# to the directory the script is started from.
kb = KnowledgeBase(path="../fashionpedia-first-generation.owl")


def add_namespace(ind):
    """Return the full IRI string for a local individual name.

    Args:
        ind: Local name, for example ``"image_21007"``.

    Returns:
        ``ind`` prefixed with the ``http://example.org/`` namespace.
    """
    return "http://example.org/" + ind


# Positive examples: image local name -> description text. The text uses the
# same "Contains <category> from supercategory ... with the following
# attributes: ..." wording that this script builds for each annotation.
pos = {'image_21007': "Contains dress from supercategory 'wholebody' with the following attributes: a-line of supercategory 'silhouette', mini (length) of supercategory 'length'", 'image_45433': "Contains dress from supercategory 'wholebody' with the following attributes: normal waist of supercategory 'waistline', gown of supercategory 'nickname', trumpet of supercategory 'silhouette', asymmetrical of supercategory 'silhouette', lining of supercategory 'textile finishing, manufacturing techniques', maxi (length) of supercategory 'length', plastic of supercategory 'non-textile material type', zip-up of supercategory 'opening type'", 'image_19012': "Contains neckline from supercategory 'garment parts' with the following attributes: straight across (neck) of supercategory 'neckline type'", 'image_42863': "Contains neckline from supercategory 'garment parts' with the following attributes: straight across (neck) of supercategory 'neckline type'", 'image_8113': "Contains neckline from supercategory 'garment parts' with the following attributes: straight across (neck) of supercategory 'neckline type'", 'image_26892': "Contains neckline from supercategory 'garment parts' with the following attributes: straight across (neck) of supercategory 'neckline type'", 'image_32017': "Contains neckline from supercategory 'garment parts' with the following attributes: straight across (neck) of supercategory 'neckline type'", 'image_42632': "Contains neckline from supercategory 'garment parts' with the following attributes: straight across (neck) of supercategory 'neckline type'"}

# Negative examples, in the same key/value format as ``pos``.
neg = {'image_34250': "Contains neckline from supercategory 'garment parts' with the following attributes: straight across (neck) of supercategory 'neckline type'", 'image_20344': "Contains neckline from supercategory 'garment parts' with the following attributes: straight across (neck) of supercategory 'neckline type'"}

# Maps a local individual name to the file name of the picture on disk.
with open('../image-filename-mappings.json', 'r') as file:
    filename_of = json.load(file)
oprop = OWLObjectProperty(IRI.create("http://example.org/hasImage"))
oprop2 = OWLObjectProperty(IRI.create("http://example.org/hasCategory"))
oprop3 = OWLObjectProperty(IRI.create("http://example.org/hasAttribute"))
dprop4 = OWLDataProperty(IRI.create("http://example.org/hasName"))
dprop5 = OWLDataProperty(IRI.create("http://example.org/hasSupercategory"))


typed_pos = set()
typed_neg = set()

# For every example image, rebuild the description of each of its annotations
# and keep the annotations whose description occurs in the example's text.
for i in pos.keys() | neg.keys():
    ind = OWLNamedIndividual(add_namespace(i))
    annotations = kb.individuals(OWLObjectHasValue(oprop, ind))
    final_description = ""
    for annotation in annotations:
        # [0] assumes each annotation has a category with a name and a
        # supercategory; a missing value raises IndexError.
        cat_ind = list(kb.get_object_property_values(annotation, oprop2))[0]
        cat_name = list(kb.get_data_property_values(cat_ind, dprop4))[0].get_literal()
        cat_supercat = list(kb.get_data_property_values(cat_ind, dprop5))[0].get_literal()
        description = f"Contains {cat_name} from supercategory '{cat_supercat}'"
        attrs = list(kb.get_object_property_values(annotation, oprop3))
        if attrs:
            attr_phrases = []
            for attr in attrs:
                attr_name = list(kb.get_data_property_values(attr, dprop4))[0].get_literal()
                attr_supercat = list(kb.get_data_property_values(attr, dprop5))[0].get_literal()
                attr_phrases.append(f"{attr_name} of supercategory '{attr_supercat}'")
            # join() gives the same text as appending "..., " per attribute
            # and then cutting off the last two characters.
            description += " with the following attributes: " + ", ".join(attr_phrases)
        final_description = description

        # Substring test, as in the original: the generated description only
        # has to occur inside the stored example text.
        if i in pos and final_description in pos[i]:
            typed_pos.add(annotation)
        elif i in neg and final_description in neg[i]:
            typed_neg.add(annotation)

print(typed_pos)
print(typed_neg)
lp = PosNegLPStandard(pos=typed_pos, neg=typed_neg)

model = CELOE(knowledge_base=kb, max_runtime=120)
model.fit(lp, verbose=False)

hypotheses = model.best_hypotheses(n=1)
print(hypotheses)
# Show at most ten matching individuals without first collecting all of them.
# NOTE(review): the learning problem is built from annotation individuals, so
# the individuals retrieved here are presumably annotations as well - confirm
# that filename_of has entries for their local names.
for shown, ind in enumerate(kb.individuals(hypotheses)):
    if shown == 10:
        break
    print(ind)
    filename = filename_of[ind.iri.reminder]
    # Close the file handle as soon as the pixels are copied into the array.
    with Image.open(f'../images/{filename}') as picture:
        img = np.asarray(picture)
    plt.imshow(img)
    plt.show()

# NOTE(review): the statements below look like leftover placeholder code (the
# try body is a bare print, and the JSON payload is dummy data). The page this
# was taken from lost all indentation, so their nesting is not recoverable;
# they are kept at top level - confirm against the repository, or remove.
try:
    print(":gdf")
except Exception as e:
    print(e)
with open("questions.json", "w") as outfile:
    json.dump({"dsad": "dasd"}, outfile)

0 comments on commit f25d35f

Please sign in to comment.