-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconf_pred_with_training.py
86 lines (65 loc) · 3.28 KB
/
conf_pred_with_training.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#
# apply_RF.py
#
# Implementation of confidence estimation algorithm in Python.
# The implementation was tested on Python 3.7.
# In order to plot the graphs, a working installation of matplotlib is required.
# The confidence prediction algorithm can be run by executing `ipython apply_RF.py` after specifying the required inputs of the algorithm.
#
#
# This framework implements the following steps, in order:
#
# - Extract the features from the original and embedding spaces of the dataset according to 6 distance measures ("euclidean", "cosine", "correlation", "chebyshev", "canberra", "braycurtis").
# - Calculate the ground-truth confidence scores for the training set from its embedding and labels; these scores are the regression targets for the RF algorithm (the model learns the confidence scores in a supervised manner).
# - Fit the RF regressor on the training set, using grid search to obtain the optimized model.
# - Calculate the confidence scores for the test set using the best RF model obtained.
#
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
import pickle
from extract_features import extract_feats
from calc_error import find_error_score, calc_npr
from evaluate_model import evaluate_regression
from apply_tsne import tsne_p
import os
def confidence_prediction(X_train, X_train_emb, X_test, X_test_emb, y_train):
    """Fit a grid-searched random-forest regressor and score the test set.

    Features are extracted with ``extract_feats`` from each (original,
    embedding) pair, flattened to one row per sample, and used to regress
    the training scores returned by ``find_error_score``. The best
    estimator found by the grid search then predicts scores for the test
    features.

    Args:
        X_train: Training data in the original space (passed to
            ``extract_feats``).
        X_train_emb: Embedding of the training data (passed to
            ``extract_feats`` and ``find_error_score``).
        X_test: Test data in the original space.
        X_test_emb: Embedding of the test data.
        y_train: Training labels (passed to ``find_error_score``).

    Returns:
        The output of ``predict`` on the flattened test features, i.e. one
        predicted score per test sample.

    Side effects:
        Prints progress messages and writes the fitted model to
        ``best_RF_model.sav`` in the current working directory.
    """
    X_train_feat = extract_feats(X_train, X_train_emb)
    X_test_feat = extract_feats(X_test, X_test_emb)

    # Flatten the two trailing feature axes into one so that every sample is
    # a single row. NOTE(review): this indexes three axes, so extract_feats
    # is assumed to return a 3-D array -- confirm against its implementation.
    X_train_feat = np.reshape(
        X_train_feat,
        [X_train_feat.shape[0], X_train_feat.shape[1] * X_train_feat.shape[2]],
    )
    X_test_feat = np.reshape(
        X_test_feat,
        [X_test_feat.shape[0], X_test_feat.shape[1] * X_test_feat.shape[2]],
    )

    # Regression targets for the training set. find_error_score also returns
    # the indexes of the erroneous samples; they are not needed here.
    _, y_tr_score = find_error_score(X_train_emb, y_train)
    y_tr_score = np.array(y_tr_score)

    # Train the model
    print("training of the model")
    param_grid = {
        'n_estimators': [20, 50, 100, 200],
        # 1.0 means "use all features", which is what the removed "auto"
        # option meant for regressors; "auto" raises in scikit-learn >= 1.3.
        'max_features': [1.0, "sqrt"],
        'criterion': ["squared_error"],
        'max_depth': [2, 5, 10, 20],
        'min_samples_split': [2, 5, 10, 20],
    }
    grid = GridSearchCV(
        RandomForestRegressor(random_state=42),
        param_grid,
        refit=True,
        verbose=0,
        cv=3,
    )
    grid.fit(X_train_feat, y_tr_score)

    # refit=True already refits the best parameter set on the whole training
    # set, so best_estimator_ is ready to use without another fit() call.
    clf = grid.best_estimator_
    pred_conf_score = clf.predict(X_test_feat)

    # Save the model; the context manager guarantees the file is flushed and
    # closed even if pickling fails.
    with open("best_RF_model.sav", 'wb') as model_file:
        pickle.dump(clf, model_file)
    print('completed')
    return pred_conf_score
if __name__ == "__main__":
    # Load each split from the file that bears its name. The train and test
    # paths were previously swapped, which paired the test features with
    # y_train inside confidence_prediction.
    X_train = np.load('../X_train.npy')  # original train set
    X_test = np.load('../X_test.npy')  # original test set
    # NOTE: allow_pickle=True can execute arbitrary code while loading; only
    # use it on label files from a trusted source.
    y_train = np.load('../y_train.npy', allow_pickle=True)  # label-encoded target values for train set

    # Compute the 2-D t-SNE embedding of each split (tsne_p is called
    # separately for the train and test data).
    X_train_emb = tsne_p(X_train, dim=2, perplexity=30)
    X_test_emb = tsne_p(X_test, dim=2, perplexity=30)

    # Calculate confidence scores for the test set
    predicted_confidence_scores = confidence_prediction(X_train, X_train_emb, X_test, X_test_emb, y_train)