diff --git a/config_hh.json b/config_hh.json
deleted file mode 100644
index 93192cd..0000000
--- a/config_hh.json
+++ /dev/null
@@ -1,80 +0,0 @@
-{
-    "classes" :
-    {
-        "X400" :
-        [
-            "/Users/mp744/Documents/CERN/hh2yybb/hh2yybbEventClassifier/ntuples/MGPy8_X400tohh_yybb.root"
-        ],
-        "X350" :
-        [
-            "/Users/mp744/Documents/CERN/hh2yybb/hh2yybbEventClassifier/ntuples/MGPy8_X350tohh_yybb.root"
-        ],
-        "X325" :
-        [
-            "/Users/mp744/Documents/CERN/hh2yybb/hh2yybbEventClassifier/ntuples/MGPy8_X325tohh_yybb.root"
-        ],
-        "H300" :
-        [
-            "/Users/mp744/Documents/CERN/hh2yybb/hh2yybbEventClassifier/ntuples/MGPy8_H300_Xtohh_yybb.root"
-        ],
-        "X275" :
-        [
-            "/Users/mp744/Documents/CERN/hh2yybb/hh2yybbEventClassifier/ntuples/MGPy8_X275tohh_yybb.root"
-        ],
-        "bkg" :
-        [
-            "/Users/mp744/Documents/CERN/hh2yybb/hh2yybbEventClassifier/ntuples/MGPy8_ybbj.root",
-            "/Users/mp744/Documents/CERN/hh2yybb/hh2yybbEventClassifier/ntuples/MGPy8_ybjj.root",
-            "/Users/mp744/Documents/CERN/hh2yybb/hh2yybbEventClassifier/ntuples/MGPy8_yjjj.root",
-            "/Users/mp744/Documents/CERN/hh2yybb/hh2yybbEventClassifier/ntuples/MGPy8_yybb.root",
-            "/Users/mp744/Documents/CERN/hh2yybb/hh2yybbEventClassifier/ntuples/MGPy8_yybj.root"
-        ]
-    },
-
-    "particles" :
-    {
-        "jet" :
-        {
-            "branches" :
-            [
-                "jet_pt",
-                "jet_eta",
-                "jet_phi",
-                "jet_m",
-                "jet_Jvt",
-                "jet_MV2c10_FixedCutBEff_60",
-                "jet_MV2c10_FixedCutBEff_70",
-                "jet_MV2c10_FixedCutBEff_77",
-                "jet_MV2c10_FixedCutBEff_85"
-            ],
-            "max_length" : 5
-        },
-
-        "photon" :
-        {
-            "branches" :
-            [
-                "photon_pt",
-                "photon_eta",
-                "photon_phi",
-                "photon_isTight",
-                "photon_ptcone20",
-                "photon_topoEtcone40"
-            ],
-            "max_length" : 3
-        },
-
-        "event" :
-        {
-            "branches" :
-            [
-                "jet_n",
-                "met",
-                "met_phi",
-                "met_sumet",
-                "photon_n"
-            ],
-            "max_length" : 1
-        }
-    }
-}
\ No newline at end of file
diff --git a/data_processing.py b/data_processing.py
index 2cd11ee..84d2f00 100644
--- a/data_processing.py
+++ b/data_processing.py
@@ -8,6 +8,7 @@ import logging
 from collections import OrderedDict
 from itertools import izip
+import tqdm
 
 logger = logging.getLogger('data_processing')
@@ -84,10 +85,10 @@ def _make_df(val, key, branches):
         if mode == 'classification':
             df['y'] = key
         elif mode == 'regression':
-            try:
-                df['y'] = int(key[1:])
-            except ValueError:
+            if key == 'bkg':
                 df['y'] = 0
+            else:
+                df['y'] = int(key[1:])
         return df
 
     all_events = pd.concat([_make_df(val, key, branches) for key, val in class_files_dict.iteritems()], ignore_index=True)
@@ -107,7 +108,7 @@ def _make_df(val, key, branches):
     #w = all_events['HGamEventInfoAuxDyn.yybb_weight'].values
     w = np.ones(len(y))
-    
+
     return X, y, w, le
@@ -168,6 +169,7 @@ def shuffle_split_scale(X, y, w):
     data = OrderedDict()
 
     for particle, (train, test) in zip(X.keys(), _pairwise(data_tuple[:(2 * len(X))])):
+        print particle
        data['X_' + particle + '_train'], data['X_' + particle + '_test'] = _scale(train, test)
 
     data['y_train'], data['y_test'], data['w_train'], data['w_test'] = data_tuple[-4:]
diff --git a/nets/__init__.py b/nets/__init__.py
new file mode 100644
index 0000000..78267cf
--- /dev/null
+++ b/nets/__init__.py
@@ -0,0 +1,4 @@
+import nn
+import nn_with_modes
+import nn_combined
+import functional_nn
diff --git a/functional_nn.py b/nets/functional_nn.py
similarity index 100%
rename from functional_nn.py
rename to nets/functional_nn.py
diff --git a/nn.py b/nets/nn.py
similarity index 100%
rename from nn.py
rename to nets/nn.py
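The `Masking(mask_value=-999, ...)` layers in `nets/nn_combined.py` below rely on the convention established by `data_processing.padding`: each particle stream is padded (or truncated) to its `max_length` from the JSON config, with absent entries filled with -999 so the recurrent layers skip them. A minimal sketch of that convention; the helper below is illustrative only, not the repo's `padding` function:

```python
import numpy as np

def pad_stream(events, max_length, n_features, pad_value=-999.):
    # events: one [n_particles, n_features] array per event (jagged across events);
    # output: a dense [n_events, max_length, n_features] array, -999 where no particle exists
    out = np.full((len(events), max_length, n_features), pad_value)
    for i, particles in enumerate(events):
        n = min(len(particles), max_length)
        out[i, :n] = np.asarray(particles)[:n]
    return out
```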
diff --git a/nets/nn_combined.py b/nets/nn_combined.py
new file mode 100644
index 0000000..633a3c0
--- /dev/null
+++ b/nets/nn_combined.py
@@ -0,0 +1,109 @@
+from keras.models import Sequential
+from keras.layers.core import Activation, Dense, Dropout
+from keras.layers import Masking, GRU, Merge, Input, merge, Lambda
+from keras.callbacks import EarlyStopping, ModelCheckpoint
+import numpy as np
+import matplotlib
+import matplotlib.pyplot as plt
+import datetime
+import time
+import logging
+import os
+
+def NN_train(data, model_name, mode):
+    '''
+    Args:
+        data: an OrderedDict containing all X, y, w ndarrays for all particles (both train and test)
+        model_name: string, identifier used to name the saved weights and model
+        mode: a string, either 'classification' or 'regression'
+    Returns:
+        combined_rnn: a recurrent neural network trained on the jet, photon, muon, electron and event-level streams of the data
+    '''
+
+    # -- training sets for each input stream
+    X_jets_train = data['X_jet_train']
+    X_photons_train = data['X_photon_train']
+    X_event_train = data['X_event_train']
+    y_train = data['y_train']
+    X_muons_train = data['X_muon_train']
+    X_electrons_train = data['X_electron_train']
+
+    # -- one sequential net per input stream
+    jet_channel = Sequential()
+    photon_channel = Sequential()
+    event_level = Sequential()
+    muon_channel = Sequential()
+    electron_channel = Sequential()
+
+    # -- input shape of each stream (everything past the event axis)
+    JET_SHAPE = X_jets_train.shape[1:]
+    PHOTON_SHAPE = X_photons_train.shape[1:]
+    EVENT_SHAPE = X_event_train.shape[1]
+    MUON_SHAPE = X_muons_train.shape[1:]
+    ELECTRON_SHAPE = X_electrons_train.shape[1:]
+
+    # -- per-stream layers; -999 is the padding value to mask out
+    jet_channel.add(Masking(mask_value=-999, input_shape=JET_SHAPE, name='jet_masking'))
+    jet_channel.add(GRU(25, name='jet_gru'))
+    jet_channel.add(Dropout(0.3, name='jet_dropout'))
+
+    photon_channel.add(Masking(mask_value=-999, input_shape=PHOTON_SHAPE, name='photon_masking'))
+    photon_channel.add(GRU(10, name='photon_gru'))
+    photon_channel.add(Dropout(0.3, name='photon_dropout'))
+
+    event_level.add(Lambda(lambda x: x, input_shape=(EVENT_SHAPE, )))
+
+    muon_channel.add(Masking(mask_value=-999, input_shape=MUON_SHAPE, name='muon_masking'))
+    muon_channel.add(GRU(10, name='muon_gru'))
+    muon_channel.add(Dropout(0.3, name='muon_dropout'))
+
+    electron_channel.add(Masking(mask_value=-999, input_shape=ELECTRON_SHAPE, name='electron_masking'))
+    electron_channel.add(GRU(10, name='electron_gru'))
+    electron_channel.add(Dropout(0.3, name='electron_dropout'))
+
+    # -- merge all five streams and process them through fully connected layers
+    combined_rnn = Sequential()
+    combined_rnn.add(Merge([jet_channel, photon_channel, event_level, muon_channel, electron_channel], mode='concat'))
+    combined_rnn.add(Dense(36, activation='relu'))
+    combined_rnn.add(Dropout(0.3))
+    combined_rnn.add(Dense(24, activation='relu'))
+    combined_rnn.add(Dropout(0.3))
+    combined_rnn.add(Dense(12, activation='relu'))
+    combined_rnn.add(Dropout(0.3))
+    if mode == 'classification':
+        combined_rnn.add(Dense(6, activation='softmax'))
+        combined_rnn.compile('adam', 'sparse_categorical_crossentropy')
+
+    elif mode == 'regression':
+        combined_rnn.add(Dense(1))
+        combined_rnn.compile('adam', 'mae')
+
+    weights_path = os.path.join('weights', 'combinedrnn-progress' + model_name + mode + '.h5')
+    try:
+        combined_rnn.load_weights(weights_path)
+        print 'Loaded pre-trained weights'
+    except IOError:
+        print 'Pre-trained weights not found'
+
+    logger = logging.getLogger('Train')
+    logger.info('Compiling the net')
+    try:
+        combined_rnn.fit([X_jets_train, X_photons_train, X_event_train, X_muons_train, X_electrons_train],
+            y_train, batch_size=16,
+            class_weight={
+                k : (float(len(y_train)) / float(len(np.unique(y_train)) * (len(y_train[y_train == k])))) for k in np.unique(y_train)
+            },
+            callbacks = [
+                EarlyStopping(verbose=True, patience=20, monitor='val_loss'),
+                ModelCheckpoint(weights_path, monitor='val_loss', verbose=True, save_best_only=True)
+            ],
+            nb_epoch=1, validation_split=0.2)
+
+    except KeyboardInterrupt:
+        print 'Training ended early.'
+
+    # -- load the best checkpointed weights back in and save out the model architecture
+    combined_rnn.load_weights(weights_path)
+    combined_rnn_json = combined_rnn.to_json()
+    open('TestModel' + model_name + '.json', 'w').write(combined_rnn_json)
+
+    return combined_rnn
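The `class_weight` dictionary passed to `fit()` above is standard inverse-frequency balancing, w_k = N / (n_classes * N_k), so under-represented classes contribute proportionally more to the loss. The same computation as a standalone sketch:

```python
import numpy as np

def balanced_class_weights(y):
    # w_k = N / (n_classes * N_k): a class holding half the events in a 6-class
    # problem gets weight ~1/3, while a rare class gets a weight well above 1.
    classes = np.unique(y)
    return {k: float(len(y)) / (len(classes) * (y == k).sum()) for k in classes}
```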
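With the reworked argument parsing, `model_name` is a positional argument between `config` and `mode`, and the default tree is now `CollectionTree`. Note that the call order of `main(config, tree, model_name, mode)` differs from the argparse declaration order; an invocation would look like this (the model name `my_rnn` is illustrative):

```python
# python pipeline.py config_hh.json my_rnn classification --tree CollectionTree
# which ends up calling:
main('config_hh.json', 'CollectionTree', 'my_rnn', 'classification')
```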
diff --git a/plotting.py b/plotting.py
index 80e8425..136b802 100644
--- a/plotting.py
+++ b/plotting.py
@@ -5,21 +5,59 @@ import pandautils as pup
 import os
 from sklearn.preprocessing import LabelEncoder
+from viz import calculate_roc, ROC_plotter, add_curve
+import cPickle
 from sklearn.metrics import confusion_matrix
-from viz import ROC_plotter, add_curve, calculate_roc
-import cPickle
 
+def plot_inputs(data, particles_dict):
+    '''
+    Args:
+        data: an OrderedDict containing all X, y, w ndarrays for all particles (both train and test), e.g.:
+            data = {
+                "X_jet_train" : X_jet_train,
+                "X_jet_test" : X_jet_test,
+                "X_photon_train" : X_photon_train,
+                "X_photon_test" : X_photon_test,
+                "y_train" : y_train,
+                "y_test" : y_test,
+                "w_train" : w_train,
+                "w_test" : w_test
+            }
+        particles_dict: dict of particle streams and their branches, as parsed from the JSON config
+    Returns:
+        Saves .pdf histograms plotting the training and test
+        sets of each class for each feature
+    '''
+
+    for particle in particles_dict.keys():
+        _plot_X(
+            data['X_' + particle + '_train'],
+            data['X_' + particle + '_test'],
+            data['y_train'],
+            data['y_test'],
+            data['w_train'],
+            data['w_test'],
+            data['LabelEncoder'],
+            particle,
+            particles_dict
+        )
+
+# --------------------------------------------------------------
+
 def _plot_X(train, test, y_train, y_test, w_train, w_test, le, particle, particles_dict):
     '''
     Args:
         train: ndarray [n_ev_train, n_features] containing the events allocated for training
-        test: ndarray [n_ev_test, n_muon_feat] containing the events allocated for testing
-        y_train: ndarray [n_ev_train, 1] containing the shuffled truth labels for training in numerical format
-        y_test: ndarray [n_ev_test, 1] containing the shuffled truth labels allocated for testing in numerical format
-        w_train: ndarray [n_ev_train, 1] containing the shuffled EventWeights allocated for training
-        w_test: ndarray [n_ev_test, 1] containing the shuffled EventWeights allocated for testing
-        varlist: list of names of branches like 'jet_px', 'photon_E', 'muon_Iso'
-        le: LabelEncoder to transform numerical y back to its string values
+        test: ndarray [n_ev_test, n_features] containing the events allocated for testing
+        y_train: ndarray [n_ev_train, 1] containing the shuffled truth labels for training in numerical format
+        y_test: ndarray [n_ev_test, 1] containing the shuffled truth labels allocated for testing in numerical format
+        w_train: ndarray [n_ev_train, 1] containing the shuffled EventWeights allocated for training
+        w_test: ndarray [n_ev_test, 1] containing the shuffled EventWeights allocated for testing
+        le: LabelEncoder to transform numerical y back to its string values
         particle: a string like 'jet', 'muon', 'photon', ...
         particles_dict: dict of particle streams and their branches, as parsed from the JSON config
     Returns:
@@ -38,6 +76,7 @@ def _plot_X(train, test, y_train, y_test, w_train, w_test, le, particle, particl
 
     # -- loop through the variables
     for column_counter, key in enumerate(varlist):
+        print key
         flat_train = pup.flatten(train[:, column_counter])
         flat_test = pup.flatten(test[:, column_counter])
@@ -89,105 +128,45 @@ def _plot_X(train, test, y_train, y_test, w_train, w_test, le, particle, particl
             os.makedirs('plots')
         plt.savefig(os.path.join('plots', key + '.pdf'))
 
-def plot_inputs(data, particles_dict):
-    '''
-    Args:
-        data: an OrderedDict containing all X, y, w ndarrays for all particles (both train and test), e.g.:
-            data = {
-                "X_jet_train" : X_jet_train,
-                "X_jet_test" : X_jet_test,
-                "X_photon_train" : X_photon_train,
-                "X_photon_test" : X_photon_test,
-                "y_train" : y_train,
-                "y_test" : y_test,
-                "w_train" : w_train,
-                "w_test" : w_test
-            }
-        #particle_names: list of strings, names of particle streams
-        particles_dict:
-    Returns:
-        Saves .pdf histograms plotting the training and test
-        sets of each class for each feature
-    '''
-
-    for particle in particles_dict.keys():
-        _plot_X(
-            data['X_' + particle + '_train'],
-            data['X_' + particle + '_test'],
-            data['y_train'],
-            data['y_test'],
-            data['w_train'],
-            data['w_test'],
-            data['LabelEncoder'],
-            particle,
-            particles_dict
-        )
+# --------------------------------------------------------------
 
-def plot_confusion(yhat, data):
-    '''
-    Args:
-        yhat: numpy array of dim [n_ev, n_classes] with the net predictions on the test data
-        data: an OrderedDict containing all X, y, w ndarrays for all particles (both train and test), e.g.:
-            data = {
-                "X_jet_train" : X_jet_train,
-                "X_jet_test" : X_jet_test,
-                "X_photon_train" : X_photon_train,
-                "X_photon_test" : X_photon_test,
-                "y_train" : y_train,
-                "y_test" : y_test,
-                "w_train" : w_train,
-                "w_test" : w_test
-            }
-    Returns:
-        Saves confusion.pdf confusion matrix
-    '''
-
-    y_test = data['y_test']
-    le = data['LabelEncoder']
-
-    def _plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):
-        plt.imshow(cm, interpolation='nearest', cmap=cmap)
-        plt.title(title)
-        plt.colorbar()
-        tick_marks = np.arange(len(np.unique(y_test)))
-        plt.xticks(tick_marks, sorted(np.unique(y_test)))
-        plt.yticks(tick_marks, sorted(np.unique(y_test)))
-        plt.tight_layout()
-        plt.ylabel('True label')
-        plt.xlabel('Predicted label')
-
-    cm = confusion_matrix(y_test, np.argmax(yhat, axis=1))
-    # Normalize the confusion matrix by row (i.e by the number of samples
-    # in each class)
-    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
-    _plot_confusion_matrix(cm_normalized, title='Normalized confusion matrix')
-    plt.savefig('confusion.pdf')
+def plot_performance(yhat, data, model_name, mode):
+    if mode == 'regression':
+        plot_regression(yhat, data, model_name)
+    elif mode == 'classification':
+        plot_yhat(yhat, data, model_name)
+        plot_confusion(yhat, data, model_name)
+        plot_roc(yhat, data, model_name)
+    else:
+        raise ValueError('Mode must be classification or regression')
 
+# --------------------------------------------------------------
 
-def plot_regression(yhat, data):
+def plot_regression(yhat, data, model_name):
     '''
     Args:
         yhat: numpy array of dim [n_ev, n_classes] with the net predictions on the test data
         data: an OrderedDict containing all X, y, w ndarrays for all particles (both train and test), e.g.:
-        data = {
-            "X_jet_train" : X_jet_train,
-            "X_jet_test" : X_jet_test,
-            "X_photon_train" : X_photon_train,
-            "X_photon_test" : X_photon_test,
-            "y_train" : y_train,
-            "y_test" : y_test,
-            "w_train" : w_train,
-            "w_test" : w_test
-        }
+            data = {
+                "X_jet_train" : X_jet_train,
+                "X_jet_test" : X_jet_test,
+                "X_photon_train" : X_photon_train,
+                "X_photon_test" : X_photon_test,
+                "y_train" : y_train,
+                "y_test" : y_test,
+                "w_train" : w_train,
+                "w_test" : w_test
+            }
     Saves:
        'regression<model_name>.pdf': a histogram of yhat, the predicted masses
     '''
-    y_test = data['y_test'].values
+    y_test = data['y_test']
     w_test = data['w_test']
 
     color = iter(cm.rainbow(np.linspace(0, 1, len(np.unique(y_test)))))
     matplotlib.rcParams.update({'font.size': 16})
+    plt.clf()
     fig = plt.figure(figsize=(11.69, 8.27), dpi=100)
 
     bins = np.linspace(
@@ -208,21 +187,102 @@ def plot_regression(yhat, data, model_name):
     plt.ylabel('Weighted Events')
     plt.legend(prop={'size': 10}, fancybox=True, framealpha=0.5)
-    plt.savefig('regression.pdf')
+    fig.savefig('regression' + model_name + '.pdf')
 
+# --------------------------------------------------------------
 
-def save_roc_curves(yhat, data, model_name):
+def plot_yhat(yhat, data, model_name):
     '''
     Args:
-        yhat: an ndarray of the probability of each event for each class
-        data: dictionary containing X, y, w ndarrays
-        model_name:
+        yhat: an ndarray of the probability of each event for each class
+        data: dictionary containing relevant data
+    Returns:
+        Saves one .pdf per class, plotting the predicted probability of belonging to that class for events of each true class
+    '''
+    y_test = data['y_test']
+    w_test = data['w_test']
+    matplotlib.rcParams.update({'font.size': 16})
+    bins = np.linspace(0, 1, 30)
+    plt.clf()
+
+    # -- loop through the classes the net can predict
+    for k in np.unique(y_test):
+        fig = plt.figure(figsize=(11.69, 8.27), dpi=100)
+        color = iter(cm.rainbow(np.linspace(0, 1, len(np.unique(y_test)))))
+        # -- loop through the true classes
+        for j in np.unique(y_test):
+            c = next(color)
+            _ = plt.hist(
+                yhat[:, k][y_test == j],
+                bins=bins,
+                histtype='step',
+                normed=True,
+                label=data['LabelEncoder'].inverse_transform(j),
+                weights=w_test[y_test == j],
+                color=c,
+                linewidth=1
+            )
+        plt.xlabel('P(y == {})'.format(data['LabelEncoder'].inverse_transform(k)))
+        plt.ylabel('Weighted Normalized Number of Events')
+        plt.legend()
+        fig.savefig('p(y=={})_'.format(data['LabelEncoder'].inverse_transform(k)) + model_name + '.pdf')
+
+# --------------------------------------------------------------
+
+def plot_confusion(yhat, data, model_name):
+    '''
+    Args:
+        yhat: numpy array of dim [n_ev, n_classes] with the net predictions on the test data
+        data: an OrderedDict containing all X, y, w ndarrays for all particles (both train and test), e.g.:
+            data = {
+                "X_jet_train" : X_jet_train,
+                "X_jet_test" : X_jet_test,
+                "X_photon_train" : X_photon_train,
+                "X_photon_test" : X_photon_test,
+                "y_train" : y_train,
+                "y_test" : y_test,
+                "w_train" : w_train,
+                "w_test" : w_test
+            }
     Returns:
-        plot:
-        pickle file: pkl file dictionary with each curve
+        Saves confusion<model_name>.pdf confusion matrix
     '''
+
+    y_test = data['y_test']
+    le = data['LabelEncoder']
+    plt.clf()
+
+    def _plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):
+        plt.imshow(cm, interpolation='nearest', cmap=cmap)
+        plt.title(title)
+        plt.colorbar()
+        tick_marks = np.arange(len(np.unique(y_test)))
+        plt.xticks(tick_marks, sorted(np.unique(y_test)))
+        plt.yticks(tick_marks, sorted(np.unique(y_test)))
+        plt.tight_layout()
+        plt.ylabel('True label')
+        plt.xlabel('Predicted label')
+
+    cm = confusion_matrix(y_test, np.argmax(yhat, axis=1))
+    # -- normalize the confusion matrix by row (i.e. by the number of samples in each class)
+    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
+    _plot_confusion_matrix(cm_normalized, title='Normalized confusion matrix')
+    plt.savefig('confusion' + model_name + '.pdf')
 
+# --------------------------------------------------------------
+
+def plot_roc(yhat, data, model_name):
+    '''
+    Args:
+        yhat: an ndarray of the probability of each event for each class
+        data: dictionary containing X, y, w ndarrays
+        model_name: string, nn identifier used in the output file names
+    Returns:
+        plot: one ROC plot per signal class
+        pickle file: pkl file dictionary with each curve
+    '''
+
+    # -- hardcoded in from cutflow!! extract them instead
     cutflow_eff = [0.0699191919192, 0.0754639175258, 0.08439, 0.0921212121212, 0.110275510204, 0.00484432269559]
 
     y_test = data['y_test']
@@ -249,11 +309,8 @@
             title=k_string + r' vs. Sherpa $\gamma \gamma$ Background',
             min_eff=0.05, max_eff=1.0, ymax=500, logscale=False)
 
-        plt.scatter(cutflow_eff[k], 1. / cutflow_eff[bkg_col], label='Cutflow -' + k_string)
+        plt.scatter(cutflow_eff[k], 1. / cutflow_eff[bkg_col], label='Cutflow ' + k_string)
         plt.legend()
         matplotlib.rcParams.update({'font.size': 16})
         fig.savefig('roc_' + k_string + '_' + model_name + '.pdf')
-    cPickle.dump(pkl_dict, open(model_name + '.pkl', 'wb'))
-
-
-
+    cPickle.dump(pkl_dict, open(model_name + '.pkl', 'wb'))
\ No newline at end of file
diff --git a/utils.py b/utils.py
index 505b4d1..02620d5 100644
--- a/utils.py
+++ b/utils.py
@@ -1,6 +1,7 @@
 import logging
 import json
 
+
 def configure_logging():
     rlogger = logging.getLogger()
     rlogger.setLevel(logging.INFO)
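Because `plot_roc` pickles its curve dictionary to `model_name + '.pkl'`, the ROC curves can be re-plotted later without rerunning the network. A minimal sketch, assuming the dictionary round-trips through `viz.ROC_plotter` the same way it is used above (`my_rnn` is an illustrative model name):

```python
import cPickle
from viz import ROC_plotter

curves = cPickle.load(open('my_rnn.pkl', 'rb'))  # dictionary saved by plot_roc
fig = ROC_plotter(curves, min_eff=0.05, max_eff=1.0, logscale=False)
fig.savefig('roc_replot.pdf')
```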