From 7efc1a8abf0ea7b706993836eacc7d37ceeae83f Mon Sep 17 00:00:00 2001
From: Gigi Stark
Date: Wed, 13 Jul 2016 14:30:35 +0200
Subject: [PATCH] Add function that plots ROC curves

---
 nn_combined.py | 12 ++++-----
 pipeline.py    | 18 +++++++-------
 plotting.py    | 67 +++++++++++++++++++++++++++++++++++++++-----------
 3 files changed, 66 insertions(+), 31 deletions(-)

diff --git a/nn_combined.py b/nn_combined.py
index 817d19f..fbd9cc3 100644
--- a/nn_combined.py
+++ b/nn_combined.py
@@ -8,7 +8,7 @@ import datetime
 import time
 
 
-def NN_train(data):
+def NN_train(data, model_name):
     '''
     Args:
         data: dictionary containing relevant data
@@ -41,6 +41,8 @@ def NN_train(data):
     #combining the jet and photon classes to make a combined recurrent neural network
     combined_rnn = Sequential()
     combined_rnn.add(Merge([jet_channel, photon_channel], mode='concat'))
+    combined_rnn.add(Dense(72, activation='relu'))
+    combined_rnn.add(Dropout(0.3))
     combined_rnn.add(Dense(36, activation='relu'))
     combined_rnn.add(Dropout(0.3))
     combined_rnn.add(Dense(24, activation='relu'))
@@ -54,7 +56,7 @@ def NN_train(data):
     print 'Training:'
     try:
         combined_rnn.fit([X_jets_train, X_photons_train],
-            y_train, batch_size=16, class_weight={
+            y_train, batch_size=100, class_weight={
                 k : (float(len(y_train)) / float(len(np.unique(y_train)) * (len(y_train[y_train == k])))) for k in np.unique(y_train)
             },
             callbacks = [
@@ -68,8 +70,7 @@ def NN_train(data):
         print 'Training ended early.'
 
     #saving the combined recurrent neural network
-    setType=raw_input("What set is this?")
-    combined_rnn.save_weights('TestModel'+setType+'.H5')
+    combined_rnn.save_weights('TestModel_'+model_name+'.H5')
     combined_rnn_json=combined_rnn.to_json()
     open('TestModel.json','w').write(combined_rnn_json)
@@ -89,9 +90,6 @@ def NN_test(net, data):
     y_test=data['y_test']
     w_test=data['w_test']
 
-    print y_test.shape
-    print w_test.shape
-
     yhat_rnn = net.predict([X_jets_test, X_photons_test], verbose = True, batch_size = 512)
 
     return yhat_rnn
\ No newline at end of file
diff --git a/pipeline.py b/pipeline.py
index a4826d3..49bbdb0 100644
--- a/pipeline.py
+++ b/pipeline.py
@@ -7,12 +7,12 @@ import logging
 from nn_combined import NN_train, NN_test
 import deepdish.io as io
-from plotting import plot_NN
+from plotting import plot_NN, plot_roc_Curve
 #from plotting import plot_inputs, plot_performance
 #from nn_model import train, test
 
 
-def main(json_config, tree_name):
+def main(json_config, model_name, tree_name):
     '''
     Args:
     -----
@@ -59,7 +59,7 @@ def sha(s):
     try:
         logger.info('Attempting to read from {}'.format(pickle_name))
         data = cPickle.load(open(pickle_name, 'rb'))
-        logger.info('Pre-processed data found and loaded from pickle')
+        logger.info('Pre-processed data found and loaded from pickle')
     # -- otherwise, process the new data
     except IOError:
         logger.info('Pre-processed data not found in {}'.format(pickle_name))
@@ -101,22 +101,21 @@ def sha(s):
     # # -- train
     # # design a Keras NN with three RNN streams (jets, photons, muons)
-    # # -- train
-    # # design a Keras NN with three RNN streams (jets, photons, muons)
-
+    le=data['LabelEncoder']
     # # combine the outputs and process them through a bunch of FF layers
     # # use a validation split of 20%
     # # save out the weights to hdf5 and the model to yaml
-    net=NN_train(data)
+    net=NN_train(data, model_name)
 
     # # -- test
     # # evaluate performance on the test set
     yhat=NN_test(net, data)
 
     # # -- plot performance
-    plot_NN(yhat, data)
+    #plot_NN(yhat, data)
 
     # # produce ROC curves to evaluate performance
+    plot_roc_Curve(yhat, data, le, model_name)
     # # save them out to pdf
     # plot_performance(yhat, data['y_test'], data['w_test'])
@@ -130,8 +129,9 @@
     # -- read in arguments
     parser = argparse.ArgumentParser()
     parser.add_argument('config', help="path to JSON file that specifies classes and corresponding ROOT files' paths")
+    parser.add_argument('model_name', help="name used to tag the saved network weights and the ROC outputs")
     parser.add_argument('--tree', help="name of the tree to open in the ntuples", default='mini')
     args = parser.parse_args()
 
     # -- pass arguments to main
-    sys.exit(main(args.config, args.tree))
+    sys.exit(main(args.config, args.model_name, args.tree))
diff --git a/plotting.py b/plotting.py
index c03f3d4..c6ff1de 100644
--- a/plotting.py
+++ b/plotting.py
@@ -5,6 +5,8 @@
 import pandautils as pup
 import os
 from sklearn.preprocessing import LabelEncoder
+from viz import calculate_roc, ROC_plotter, add_curve
+import cPickle
 
 def _plot_X(train, test, y_train, y_test, w_train, w_test, varlist, le, feature):
     '''
@@ -120,19 +122,54 @@ def plot_NN(yhat, data):
     matplotlib.rcParams.update({'font.size': 16})
     fig = plt.figure(figsize=(11.69, 8.27), dpi=100)
     bins = np.linspace(0,1,30)
+    #find probability of each class
     for k in range(len(np.unique(y_test))):
-        color = iter(cm.rainbow(np.linspace(0, 1, len(np.unique(y_test)))))
-        for j in range (len(np.unique(y_test))):
-            c = next(color)
-            _ = plt.hist(yhat[:,k][y_test==j],
-                bins=bins,
-                histtype='step',
-                normed=True,
-                label='Y=' + str(j),
-                weights=w_test[y_test == j],
-                color=c,
-                linewidth=1)
-        plt.xlabel('Probabilty of Y=' +str(k))
-        plt.ylabel('Weighted Normalized Number of Events')
-        plt.legend()
-        plt.show()
+        print k
+        color = iter(cm.rainbow(np.linspace(0, 1, len(np.unique(y_test)))))
+        #find the truth label for each class
+        for j in range (len(np.unique(y_test))):
+            c = next(color)
+            _ = plt.hist(yhat[:,k][y_test==j],
+                bins=bins,
+                histtype='step',
+                normed=True,
+                label='Y=' + str(j),
+                weights=w_test[y_test == j],
+                color=c,
+                linewidth=1)
+        plt.xlabel('Probability of Y=' + str(k))
+        plt.ylabel('Weighted Normalized Number of Events')
+        plt.legend()
+        plt.savefig('/Users/gigifstark/CERN_Work/HH2YBB')
+
+def plot_roc_Curve(yhat, data, le, model_name):
+    '''
+    Args:
+        yhat: an ndarray of the probability of each event for each class
+        data: dictionary containing relevant data
+    Returns:
+        plot: matplotlib ROC figure for each signal class (mass point) compared to the background
+        pickle file: dictionary with each ROC curve, saved to a pkl file
+    '''
+    y_test=data['y_test']
+    w_test=data['w_test']
+    pkl_dict={}
+    for k in range(0, len(np.unique(y_test))-1):
+        sig_back= (y_test==k)|(y_test==5) # keep only signal class k and the background class (5)
+        y=np.log(yhat[sig_back][:,k]/yhat[sig_back][:,5]) # log-likelihood-ratio discriminant
+        finite= np.isfinite(y)
+        curves_dictionary=add_curve ("Y="+str(k), 'blue',
+            calculate_roc(
+                y_test[sig_back][finite],
+                np.log(yhat[sig_back][finite][:,k]/yhat[sig_back][finite][:,5]),
+                pos_label=k,
+                weights=w_test[sig_back][finite]
+            )
+        )
+        pkl_dict.update(curves_dictionary)
+        print 'Plotting'
+        fig=ROC_plotter(curves_dictionary, model_name, title=le.inverse_transform(k), min_eff = 0.1, max_eff=1.0, logscale=True)
+        plt.ylim([0,100])
+        fig.savefig('/Users/gigifstark/CERN_Work/HH2YBB/roc'+ str(k)+'.pdf')
+    cPickle.dump(pkl_dict, open(model_name+"_pkl", 'wb'))
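
Note on the ROC construction in plot_roc_Curve: each signal class k is compared against the background class (hard-coded as index 5, the last label) with the log-likelihood ratio log(p_k / p_5), computed only for events whose true label is k or 5 and whose two probabilities are both non-zero. The sketch below is illustrative only: the helper name is arbitrary, it uses sklearn.metrics.roc_curve in place of the calculate_roc/add_curve/ROC_plotter helpers from the viz package, and it assumes (as the logscale=True and 0-100 y-range in the patch suggest) that curves are reported as signal efficiency versus background rejection (1 / false positive rate).

    import numpy as np
    from sklearn.metrics import roc_curve

    def roc_vs_background(yhat, y_test, w_test, k, background=5):
        # keep only events whose true label is the signal class k or the background class
        sig_back = (y_test == k) | (y_test == background)
        # log-likelihood-ratio discriminant between class k and the background
        llr = np.log(yhat[sig_back][:, k] / yhat[sig_back][:, background])
        # drop events where either class probability is zero (llr is +/-inf or nan)
        finite = np.isfinite(llr)
        fpr, tpr, _ = roc_curve(
            y_test[sig_back][finite],
            llr[finite],
            pos_label=k,
            sample_weight=w_test[sig_back][finite]
        )
        # report signal efficiency vs. background rejection (1 / false positive rate)
        nonzero = fpr > 0
        return tpr[nonzero], 1. / fpr[nonzero]

    # e.g. eff, rej = roc_vs_background(yhat, data['y_test'], data['w_test'], k=0)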