ml_utils.py
import os
import torch
import torch.nn as nn
from torchvision import models, transforms
from PIL import Image
import pickle
import numpy as np
from sklearn.linear_model import LogisticRegression
# Recommendation utilities:
# - a pretrained ResNet50 provides image embeddings,
# - a logistic regression is trained on user interactions as labels,
# - scores for new images are predicted from a user's past behaviour.
EMBEDDING_FILE = '/app/data/image_embeddings.pkl'
MODEL_FILE = '/app/data/recommender_model.pkl'
# Image preprocessing for embedding extraction (standard ImageNet normalization)
image_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])
def load_embedding_model():
    # Pretrained ResNet50 with the final classification layer removed, so the
    # network outputs general-purpose 2048-d feature vectors.
    # `weights=` replaces the deprecated `pretrained=True` (torchvision >= 0.13);
    # IMAGENET1K_V1 matches the weights the old flag loaded.
    model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
    model = nn.Sequential(*list(model.children())[:-1])  # drop the last fc layer
    model.eval()
    return model
def extract_image_embedding(model, image_path):
    # Load the image, apply the preprocessing pipeline, and run a forward pass
    # without gradients to get a flat 2048-d embedding.
    image = Image.open(image_path).convert('RGB')
    tensor = image_transform(image).unsqueeze(0)  # add batch dimension
    with torch.no_grad():
        embedding = model(tensor).numpy().flatten()
    return embedding
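# Example usage (the path below is a hypothetical placeholder; any RGB image works):
#   model = load_embedding_model()
#   emb = extract_image_embedding(model, '/app/images/example.jpg')
#   emb.shape  # -> (2048,), the ResNet50 pooled feature size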
def generate_embeddings(image_dir, image_list):
    """
    Generate embeddings for all images and store them in a pickled dictionary.
    image_list: list of dicts with {"id": sample.id, "url": "...", "labels": "..."}
    Existing embeddings are reloaded and only missing images are processed.
    """
    if os.path.exists(EMBEDDING_FILE):
        with open(EMBEDDING_FILE, 'rb') as f:
            embeddings = pickle.load(f)
    else:
        embeddings = {}
    model = load_embedding_model()
    for img in image_list:
        if img['id'] in embeddings:
            continue  # already embedded
        image_filename = img['url'].replace('/images/', '')
        image_path = os.path.join(image_dir, image_filename)
        embeddings[img['id']] = extract_image_embedding(model, image_path)
    with open(EMBEDDING_FILE, 'wb') as f:
        pickle.dump(embeddings, f)
def load_embeddings():
    with open(EMBEDDING_FILE, 'rb') as f:
        return pickle.load(f)
def build_training_data(interactions, label_map, embeddings):
    """
    Build (X, y) for model training.
    - interactions: list of tuples from the DB:
      (id, username, image_id, action, timestamp, hover_time, comment)
    - label_map: dict of image_id -> [labels] (currently unused, reserved for later)
    - embeddings: dict of image_id -> embedding vector
    """
    X = []
    y = []
    # Simple binary target: 1 for positive actions (like, comment), 0 for everything
    # else. Hovers are not counted as positive for now; adjust later if needed.
    positive_actions = ['like', 'comment']
    for interaction in interactions:
        # unpack the interaction row
        _, username, image_id, action, _, hover_time, _ = interaction
        if image_id not in embeddings:
            continue
        label = 1 if action in positive_actions else 0
        # hover_time is ignored for now; the image embedding alone is the feature vector.
        X.append(embeddings[image_id])
        y.append(label)
    return np.array(X), np.array(y)
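# Illustrative call with made-up rows that follow the tuple layout documented above;
# the real rows come from the DB, and label_map is passed as an empty dict because
# build_training_data does not use it yet:
#   interactions = [
#       (1, 'alice', 'img_001', 'like', '2024-01-01T10:00:00', 0.0, None),
#       (2, 'alice', 'img_002', 'view', '2024-01-01T10:01:00', 2.5, None),
#   ]
#   X, y = build_training_data(interactions, {}, load_embeddings())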
def train_model(X, y):
    if len(X) < 10:  # need enough interactions before fitting
        print("Not enough data to train model.")
        return None
    if len(np.unique(y)) < 2:  # LogisticRegression needs both classes present
        print("Need both positive and negative examples to train.")
        return None
    model = LogisticRegression(max_iter=1000)
    model.fit(X, y)
    with open(MODEL_FILE, 'wb') as f:
        pickle.dump(model, f)
    print("Model trained and saved.")
    return model
def load_model():
    if not os.path.exists(MODEL_FILE):
        return None
    with open(MODEL_FILE, 'rb') as f:
        return pickle.load(f)
def predict_scores(model, embeddings, image_ids):
    """
    Given a trained model and a list of image_ids, return predicted scores.
    """
    X = []
    for img_id in image_ids:
        if img_id in embeddings:
            X.append(embeddings[img_id])
        else:
            # Use a zero vector for missing embeddings (ResNet50 embedding size)
            X.append(np.zeros((2048,)))
    X = np.array(X)
    # Probability of class 1, i.e. a positive interaction
    scores = model.predict_proba(X)[:, 1]
    return scores
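if __name__ == '__main__':
    # Minimal end-to-end sketch for local testing only. The directory, image list
    # and interaction rows below are hypothetical placeholders, not real app data.
    demo_images = [
        {'id': 'img_001', 'url': '/images/cat.jpg', 'labels': 'cat'},
        {'id': 'img_002', 'url': '/images/dog.jpg', 'labels': 'dog'},
    ]
    generate_embeddings('/app/images', demo_images)
    embeddings = load_embeddings()

    demo_interactions = [
        (1, 'alice', 'img_001', 'like', '2024-01-01T10:00:00', 0.0, None),
        (2, 'alice', 'img_002', 'view', '2024-01-01T10:01:00', 2.5, None),
    ]
    X, y = build_training_data(demo_interactions, {}, embeddings)
    train_model(X, y)  # skips saving when there is too little data

    model = load_model()
    if model is not None:
        print(predict_scores(model, embeddings, ['img_001', 'img_002']))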