restructuring of files

dice-group · Nov 14, 2024 · f25d35f · f25d35f
1 parent 0c361ef
commit f25d35f
Show file tree

Hide file tree

Showing 14 changed files with 276 additions and 43 deletions.
diff --git a/general_working_directory/convert_to_csv.py → archives/convert_to_csv.py b/general_working_directory/convert_to_csv.py → archives/convert_to_csv.py
diff --git a/...rking_directory/recommendation_script1.py → archives/embedding-retriever_old.py b/...rking_directory/recommendation_script1.py → archives/embedding-retriever_old.py
diff --git a/...ing_directory/image_to_filename_mapper.py → archives/image_to_filename_mapper.py b/...ing_directory/image_to_filename_mapper.py → archives/image_to_filename_mapper.py
diff --git a/...ctory/local-test-embeddings-generation.py → archives/local-test-embeddings-generation.py b/...ctory/local-test-embeddings-generation.py → archives/local-test-embeddings-generation.py
diff --git a/...orking_directory/recommendation_script.py → archives/recommendation_script.py b/...orking_directory/recommendation_script.py → archives/recommendation_script.py
diff --git a/archives/recommendation_script_old.py b/archives/recommendation_script_old.py
@@ -0,0 +1,72 @@
+import numpy
+from PIL import Image
+import matplotlib.pyplot as plt
+import numpy as np
+import torch
+from transformers import AutoModel
+from numpy.linalg import norm
+import json
+
+
+def sort(d):
+    """Return a new dict with the items of ``d`` ordered by value, largest first."""
+    by_value_desc = sorted(d.items(), key=lambda pair: pair[1], reverse=True)
+    return dict(by_value_desc)
+
+# Ask the user for a free-text preference; this is the query that gets embedded.
+intent = input('What do you like to wear?\n')  # "I like a dress with wide neckline"
+
+
+def cos_sim(a, b):
+    """Return the cosine similarity of the vectors ``a`` and ``b``."""
+    return (a @ b.T) / (norm(a) * norm(b))
+
+
+model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-de', trust_remote_code=True,
+                                  torch_dtype=torch.bfloat16)
+
+# Precomputed embeddings, keyed by an image/description identifier.
+with open("fashionpedia-embeddings.json", 'r') as f:
+    data = json.load(f)
+
+# Lookup table used further down to resolve an image identifier to a file name.
+with open('../image-filename-mappings.json', 'r') as file:
+    filename_of = json.load(file)
+
+# The query never changes inside the loop, so it is embedded once here instead
+# of re-running the model for every stored embedding.
+intent_embedding = model.encode(intent)
+
+image_cos = {}  # identifier -> cosine similarity with the query
+top_cos_sin_value = 0
+top_image = ""
+for image, stored_embedding in data.items():
+    v = cos_sim(intent_embedding, numpy.array(stored_embedding))
+    image_cos[image] = v
+    # Track the single best match while scoring.
+    if v > top_cos_sin_value:
+        top_cos_sin_value = v
+        top_image = image
+
+print(sort(image_cos))
+# print(f"Top Image: {top_image} with Cosine Similarity: {top_cos_sin_value}")
+# top_image_filename = filename_of[top_image[:-2]]
+#
+# img = np.asarray(Image.open(f"images/{top_image_filename}"))
+# plt.imshow(img)
+# plt.show()
+
+# Show the best-scoring images one by one and record the user's verdict on each.
+seen_set = set()  # file names already shown, so an image is only displayed once
+pos_set = set()
+neg_set = set()
+for image in sort(image_cos):
+    # The last two characters of the key are dropped to obtain the identifier
+    # used by the file-name mapping (presumably a "_<n>" description suffix -- confirm).
+    image_id = image[:-2]
+    filename = filename_of[image_id]
+    if filename not in seen_set:
+        seen_set.add(filename)
+        # Read the pixels inside a context manager so the file handle is released
+        # right away instead of staying open for the rest of the session.
+        with Image.open(f"images/{filename}") as picture:
+            img = np.asarray(picture)
+        plt.imshow(img)
+        plt.show()
+        feedback = input('Does this image contain something that fit your preferences? (y/n)\n')
+        if feedback == "y":
+            pos_set.add(image_id)
+        elif feedback == "n":
+            neg_set.add(image_id)
+        else:
+            print('Neutral selected')
+    # Stop once four distinct images have been shown.
+    if len(seen_set) == 4:
+        break
+
+# json cannot serialise Python sets (json.dump raises TypeError), so the
+# examples are written out as sorted lists.
+with open("lp.json", 'w') as f:
+    json.dump({"pos": sorted(pos_set), "neg": sorted(neg_set)}, f)
+
+# print(f"Positive examples: {pos_set}")
+# print(f"Negative examples: {neg_set}")
diff --git a/general_working_directory/embeddings-generation.py b/general_working_directory/embeddings-generation.py
@@ -1,30 +1,30 @@
+import csv
+from openai import OpenAI
 from owlapy.iri import IRI
 from owlapy.owl_ontology_manager import OntologyManager
 from owlapy.owl_property import OWLDataProperty
 from owlapy.owl_reasoner import OntologyReasoner, FastInstanceCheckerReasoner
-import torch
-from transformers import AutoModel
-from numpy.linalg import norm
-import json
 
 manager = OntologyManager()
-ontology = manager.load_ontology(IRI.create("file://../fashionpedia-second-generation.owl"))
+# The ontology file is resolved relative to the current working directory.
+ontology = manager.load_ontology(IRI.create("file://fashionpedia-third-generation.owl"))
 base_reasoner = OntologyReasoner(ontology)
 reasoner = FastInstanceCheckerReasoner(base_reasoner=base_reasoner, ontology=ontology)
-has_description = OWLDataProperty(IRI.create("http://example.org/hasDescription"))
+# Data properties read for every individual below:
+# dprop2 -> hasDescription, dprop3 -> hasLLMDescription.
+dprop2 = OWLDataProperty(IRI.create("http://example.org/hasDescription"))
+dprop3 = OWLDataProperty(IRI.create("http://example.org/hasLLMDescription"))
 
-cos_sim = lambda a, b: (a @ b.T) / (norm(a) * norm(b))
-model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-de', trust_remote_code=True,
-                                  torch_dtype=torch.bfloat16)
-
-embeddings_final = {}
-for image in ontology.individuals_in_signature():
-    descriptions = list(reasoner.data_property_values(image, has_description))
-    desc_counter = 1
-    for description in descriptions:
-        embeddings = model.encode(description.get_literal())
-        embeddings_final[image.str.split("/")[-1] + f"_{desc_counter}"] = embeddings.tolist()
-        desc_counter += 1
-
-with open("../fashionpedia-embeddingsss", 'w') as f:
-    json.dump(embeddings_final, f)
+# Append one CSV row per individual: its IRI followed by the embedding returned
+# by the embedding endpoint for the individual's combined descriptions.
+with open('output.csv', mode='a', newline='') as file:
+    writer = csv.writer(file)
+    client = OpenAI(base_url="http://tentris-ml.cs.upb.de:8502/v1", api_key="token-tentris-upb")
+    for count, image_ind in enumerate(ontology.individuals_in_signature(), start=1):
+        # An individual without an LLM description used to abort the whole run
+        # with an IndexError; it now contributes an empty LLM description instead.
+        llm_values = list(reasoner.data_property_values(image_ind, dprop3))
+        # Only the first LLM description is used, capped at 4000 characters
+        # (presumably to respect the endpoint's input limit -- confirm).
+        llm_description = str(llm_values[0].get_literal())[:4000] if llm_values else ""
+        # Every plain description is newline-terminated and concatenated.
+        all_descriptions = "".join(
+            d.get_literal() + "\n" for d in reasoner.data_property_values(image_ind, dprop2)
+        )
+        image_iri = image_ind.str
+        responses = client.embeddings.create(input=[all_descriptions + "\n " + llm_description], model="tentris")
+        writer.writerow([image_iri, responses.data[0].embedding])
+        # Progress output; the total is a hard-coded figure.
+        print(f"{image_iri}: {count:,}/45,623")
diff --git a/local_working_directory/bm25_fast_score_placement.py b/local_working_directory/bm25_fast_score_placement.py
@@ -0,0 +1,73 @@
+import numpy as np
+from sklearn.feature_extraction.text import TfidfVectorizer
+from scipy import sparse
+import rdflib
+
+
+class BM25(object):
+    """BM25 scorer built on top of scikit-learn's ``TfidfVectorizer``.
+
+    The vectorizer supplies the vocabulary, the raw term counts and the IDF
+    weights; ``transform`` combines them into one BM25 score per document.
+    """
+
+    def __init__(self, b=0.75, k1=1.6):
+        """Store the two BM25 parameters and create the (unfitted) vectorizer.
+
+        ``b`` weights the document-length ratio ``len / avdl`` and ``k1``
+        scales both the numerator and the denominator in ``transform``.
+        """
+        self.vectorizer = TfidfVectorizer(norm=None, smooth_idf=False)
+        self.b = b
+        self.k1 = k1
+
+    def fit(self, X):
+        """Fit the vectorizer (vocabulary and IDF) to documents ``X``.
+
+        Also stores ``self.avdl``, the mean of the per-document term-count
+        sums, i.e. the average document length used by ``transform``.
+        """
+        self.vectorizer.fit(X)
+        # Calling the parent-class transform yields raw counts rather than tf-idf.
+        y = super(TfidfVectorizer, self.vectorizer).transform(X)
+        self.avdl = y.sum(1).mean()
+
+    def transform(self, q, X):
+        """Calculate BM25 between query ``q`` and documents ``X``.
+
+        Returns a flat array with one score per document in ``X``.
+        ``fit`` must have been called first, because ``self.avdl`` and the
+        fitted vectorizer are read here.
+        """
+        b, k1, avdl = self.b, self.k1, self.avdl
+
+        # apply CountVectorizer
+        X = super(TfidfVectorizer, self.vectorizer).transform(X)
+        # Per-document length: the sum of the term counts in each row.
+        len_X = X.sum(1).A1
+        # The query is vectorized as a one-element batch and unpacked to its single row.
+        q, = super(TfidfVectorizer, self.vectorizer).transform([q])
+        assert sparse.isspmatrix_csr(q)
+
+        # convert to csc for better column slicing
+        # Only the columns of terms that occur in the query are kept.
+        X = X.tocsc()[:, q.indices]
+        # Denominator: tf + k1 * (1 - b + b * len / avdl), broadcast per document.
+        denom = X + (k1 * (1 - b + b * len_X / avdl))[:, None]
+        # idf(t) = log [ n / df(t) ] + 1 in sklearn, so it needs to be converted
+        # to idf(t) = log [ n / df(t) ] by subtracting 1
+        idf = self.vectorizer._tfidf.idf_[None, q.indices] - 1.
+        # Numerator: tf * idf * (k1 + 1).
+        numer = X.multiply(np.broadcast_to(idf, X.shape)) * (k1 + 1)
+        # Sum the per-term contributions of each document into a single score.
+        return (numer / denom).sum(1).A1
+
+
+# ------------ End of library implementation. What follows is an example -----------------
+from sklearn.datasets import fetch_20newsgroups
+
+# documents = fetch_20newsgroups(subset='train').data
+g = rdflib.Graph()
+g.parse("fashionpedia-third-generation.owl", format="xml")
+
+# Build one text document per distinct subject: the subject itself followed by
+# every object attached to it. ``g.subjects()`` yields a subject once per
+# triple, so it is de-duplicated first (order preserved); otherwise the same
+# document is added many times, which skews the IDF and average-length statistics.
+iris = []
+documents = []
+for subject in dict.fromkeys(g.subjects()):
+    # exclude unnecessary information: predicates are dropped, only objects are kept
+    objects_text = "".join(f"{obj} \n" for _, obj in g.predicate_objects(subject))
+    iris.append(str(subject))
+    documents.append(f"{str(subject)} \n" + objects_text)
+
+bm25 = BM25()
+bm25.fit(documents)
+# Find the similar documents given query
+query = "What elegant and traditional bridal gowns are available?"
+target_iri = "http://example.org/image_47178"
+scores = bm25.transform(query, documents)
+# ``iris`` and ``scores`` are aligned with ``documents``.
+storage = dict(zip(iris, scores))
+
+# Highest score first.
+storage_sorted = dict(sorted(storage.items(), key=lambda item: item[1], reverse=True))
+
+# A target that is missing from the graph is reported instead of raising ValueError.
+if target_iri in storage_sorted:
+    placement = list(storage_sorted).index(target_iri)
+    print(f"target_iri placement: {placement} \n target_iri score: {storage_sorted[target_iri]}")
+else:
+    print(f"target_iri {target_iri} was not found in the graph")
+
+# Report only the best-scoring entry rather than dumping every (iri, score) pair.
+if storage_sorted:
+    print(f" Top scorer: {next(iter(storage_sorted.items()))}")
+
+# print(documents[np.argmax(scores)])
diff --git a/local_working_directory/embedding-retriever.py b/local_working_directory/embedding-retriever.py
@@ -0,0 +1,25 @@
+from openai import OpenAI
+import pandas as pd
+import numpy as np
+from numpy.linalg import norm
+
+# Rank the stored embeddings against one free-text query by cosine similarity.
+query = 'I like a dress with wide neckline'
+
+client = OpenAI(base_url="http://tentris-ml.cs.upb.de:8502/v1", api_key="token-tentris-upb")
+
+# The first CSV column is used as the index; the remaining columns are the vectors.
+df = pd.read_csv("embeddings_short2.csv", index_col=0, nrows=None)
+iris = df.index.values.tolist()
+docs = np.array(df.values)
+
+# Embed the query with the same endpoint.
+query_response = client.embeddings.create(input=[query], model="tentris")
+qr = np.array(query_response.data[0].embedding)
+
+# Scale every row and the query to unit length, so the dot product below
+# is the cosine similarity.
+row_lengths = norm(docs, axis=1, keepdims=True)
+docs_norms = docs / row_lengths
+qr_norms = qr / norm(qr)
+
+cosine_similarities = (docs_norms @ qr_norms).flatten()
+
+best_match_index = np.argmax(cosine_similarities)
+best_similarity = cosine_similarities[best_match_index]
+
+print(cosine_similarities)
+print(f"The best scoring image is the image with iri: {iris[best_match_index]} and score: {best_similarity}")
diff --git a/local_working_directory/embeddings_short2.csv b/local_working_directory/embeddings_short2.csv
diff --git a/local_working_directory/fashionpedia-embeddings.csv b/local_working_directory/fashionpedia-embeddings.csv
diff --git a/local_working_directory/fashionpedia-embeddings.json b/local_working_directory/fashionpedia-embeddings.json
diff --git a/local_working_directory/image_18321_embeddings b/local_working_directory/image_18321_embeddings
diff --git a/local_working_directory/recommendation_script2_first_dataset.py b/local_working_directory/recommendation_script2_first_dataset.py
@@ -0,0 +1,82 @@
+import json
+from PIL import Image
+
+import matplotlib.pyplot as plt
+import numpy as np
+from owlapy.class_expression import OWLObjectHasValue
+from owlapy.iri import IRI
+from owlapy.owl_individual import OWLNamedIndividual
+from ontolearn.knowledge_base import KnowledgeBase
+from ontolearn.concept_learner import EvoLearner, CELOE
+from ontolearn.learning_problem import PosNegLPStandard
+from owlapy.owl_property import OWLObjectProperty, OWLDataProperty
+
+# Knowledge base queried and learned over below; the path is relative to the working directory.
+kb = KnowledgeBase(path="../fashionpedia-first-generation.owl")
+
+
+def add_namespace(ind):
+    """Prefix the local name ``ind`` with the ``http://example.org/`` namespace."""
+    return f"http://example.org/{ind}"
+
+
+# Hand-picked learning examples: image identifier -> textual description of the
+# annotation of interest. ``pos`` holds the positive examples, ``neg`` the negative ones.
+pos = {'image_21007': "Contains dress from supercategory 'wholebody' with the following attributes: a-line of supercategory 'silhouette', mini (length) of supercategory 'length'", 'image_45433': "Contains dress from supercategory 'wholebody' with the following attributes: normal waist of supercategory 'waistline', gown of supercategory 'nickname', trumpet of supercategory 'silhouette', asymmetrical of supercategory 'silhouette', lining of supercategory 'textile finishing, manufacturing techniques', maxi (length) of supercategory 'length', plastic of supercategory 'non-textile material type', zip-up of supercategory 'opening type'", 'image_19012': "Contains neckline from supercategory 'garment parts' with the following attributes: straight across (neck) of supercategory 'neckline type'", 'image_42863': "Contains neckline from supercategory 'garment parts' with the following attributes: straight across (neck) of supercategory 'neckline type'", 'image_8113': "Contains neckline from supercategory 'garment parts' with the following attributes: straight across (neck) of supercategory 'neckline type'", 'image_26892': "Contains neckline from supercategory 'garment parts' with the following attributes: straight across (neck) of supercategory 'neckline type'", 'image_32017': "Contains neckline from supercategory 'garment parts' with the following attributes: straight across (neck) of supercategory 'neckline type'", 'image_42632': "Contains neckline from supercategory 'garment parts' with the following attributes: straight across (neck) of supercategory 'neckline type'"}
+
+neg = {'image_34250': "Contains neckline from supercategory 'garment parts' with the following attributes: straight across (neck) of supercategory 'neckline type'", 'image_20344': "Contains neckline from supercategory 'garment parts' with the following attributes: straight across (neck) of supercategory 'neckline type'"}
+
+# Lookup table used at the end of the script to resolve an individual to an image file name.
+with open('../image-filename-mappings.json', 'r') as file:
+    filename_of = json.load(file)
+# Object properties: oprop -> hasImage, oprop2 -> hasCategory, oprop3 -> hasAttribute.
+oprop = OWLObjectProperty(IRI.create("http://example.org/hasImage"))
+oprop2 = OWLObjectProperty(IRI.create("http://example.org/hasCategory"))
+oprop3 = OWLObjectProperty(IRI.create("http://example.org/hasAttribute"))
+# Data properties: dprop4 -> hasName, dprop5 -> hasSupercategory.
+dprop4 = OWLDataProperty(IRI.create("http://example.org/hasName"))
+dprop5 = OWLDataProperty(IRI.create("http://example.org/hasSupercategory"))
+
+
+# Annotation individuals whose generated description matches a positive / negative example.
+typed_pos = set()
+typed_neg = set()
+
+for image_id in set(pos) | set(neg):
+    image_individual = OWLNamedIndividual(add_namespace(image_id))
+    # Every annotation that points at this image through ``hasImage``.
+    for annotation in kb.individuals(OWLObjectHasValue(oprop, image_individual)):
+        category = list(kb.get_object_property_values(annotation, oprop2))[0]
+        category_name = list(kb.get_data_property_values(category, dprop4))[0].get_literal()
+        category_super = list(kb.get_data_property_values(category, dprop5))[0].get_literal()
+        final_description = f"Contains {category_name} from supercategory '{category_super}'"
+
+        attributes = list(kb.get_object_property_values(annotation, oprop3))
+        if attributes:
+            # One "<name> of supercategory '<supercategory>'" phrase per attribute, comma separated.
+            phrases = []
+            for attribute in attributes:
+                attribute_name = list(kb.get_data_property_values(attribute, dprop4))[0].get_literal()
+                attribute_super = list(kb.get_data_property_values(attribute, dprop5))[0].get_literal()
+                phrases.append(f"{attribute_name} of supercategory '{attribute_super}'")
+            final_description += " with the following attributes: " + ", ".join(phrases)
+
+        # Substring match of the rebuilt description against the stored example text.
+        if image_id in pos and final_description in pos[image_id]:
+            typed_pos.add(annotation)
+        elif image_id in neg and final_description in neg[image_id]:
+            typed_neg.add(annotation)
+
+# Learn a class expression separating the typed positive and negative examples.
+print(typed_pos)
+print(typed_neg)
+lp = PosNegLPStandard(pos=typed_pos, neg=typed_neg)
+
+model = CELOE(knowledge_base=kb, max_runtime=120)
+model.fit(lp, verbose=False)
+
+hypotheses = model.best_hypotheses(n=1)
+print(hypotheses)
+# Display at most ten individuals covered by the learned expression.
+for ind in list(kb.individuals(hypotheses))[:10]:
+    print(ind)
+    filename = filename_of[ind.iri.reminder]
+    # Read the pixels inside a context manager so the file handle is released
+    # as soon as the array has been built.
+    with Image.open(f'../images/{filename}') as picture:
+        img = np.asarray(picture)
+    plt.imshow(img)
+    plt.show()