From 7efc1a8abf0ea7b706993836eacc7d37ceeae83f Mon Sep 17 00:00:00 2001
From: Gigi Stark
Date: Wed, 13 Jul 2016 14:30:35 +0200
Subject: [PATCH] Add function that plots ROC curves

---
 nn_combined.py | 12 ++++-----
 pipeline.py    | 18 +++++++-------
 plotting.py    | 67 +++++++++++++++++++++++++++++++++++++++-----------
 3 files changed, 66 insertions(+), 31 deletions(-)

diff --git a/nn_combined.py b/nn_combined.py
index 817d19f..fbd9cc3 100644
--- a/nn_combined.py
+++ b/nn_combined.py
@@ -8,7 +8,7 @@ import datetime
 import time
 
 
-def NN_train(data):
+def NN_train(data, model_name):
     '''
     Args:
         data: dictionary containing relevant data
@@ -41,6 +41,8 @@ def NN_train(data):
     #combining the jet and photon classes to make a combined recurrent neural network
     combined_rnn = Sequential()
     combined_rnn.add(Merge([jet_channel, photon_channel], mode='concat'))
+    combined_rnn.add(Dense(72, activation='relu'))
+    combined_rnn.add(Dropout(0.3))
     combined_rnn.add(Dense(36, activation='relu'))
     combined_rnn.add(Dropout(0.3))
     combined_rnn.add(Dense(24, activation='relu'))
@@ -54,7 +56,7 @@ def NN_train(data):
     print 'Training:'
     try:
         combined_rnn.fit([X_jets_train, X_photons_train],
-            y_train, batch_size=16, class_weight={
+            y_train, batch_size=100, class_weight={
                 k : (float(len(y_train)) / float(len(np.unique(y_train)) * (len(y_train[y_train == k])))) for k in np.unique(y_train)
             },
             callbacks = [
@@ -68,8 +70,7 @@ def NN_train(data):
         print 'Training ended early.'
 
     #saving the combined recurrent neural network
-    setType=raw_input("What set is this?")
-    combined_rnn.save_weights('TestModel'+setType+'.H5')
+    combined_rnn.save_weights('TestModel_'+model_name+'.H5')
     combined_rnn_json=combined_rnn.to_json()
     open('TestModel.json','w').write(combined_rnn_json)
@@ -89,9 +90,6 @@ def NN_test(net, data):
     y_test=data['y_test']
     w_test=data['w_test']
 
-    print y_test.shape
-    print w_test.shape
-
     yhat_rnn = net.predict([X_jets_test, X_photons_test], verbose = True, batch_size = 512)
 
     return yhat_rnn
\ No newline at end of file
diff --git a/pipeline.py b/pipeline.py
index a4826d3..49bbdb0 100644
--- a/pipeline.py
+++ b/pipeline.py
@@ -7,12 +7,12 @@ import logging
 from nn_combined import NN_train, NN_test
 import deepdish.io as io
-from plotting import plot_NN
+from plotting import plot_NN, plot_roc_Curve
 #from plotting import plot_inputs, plot_performance
 #from nn_model import train, test
 
 
-def main(json_config, tree_name):
+def main(json_config, model_name, tree_name):
     '''
     Args:
     -----
@@ -59,7 +59,7 @@ def sha(s):
     try:
         logger.info('Attempting to read from {}'.format(pickle_name))
         data = cPickle.load(open(pickle_name, 'rb'))
-        logger.info('Pre-processed data found and loaded from pickle')
+        logger.info('Pre-processed data found and loaded from pickle')
     # -- otherwise, process the new data
     except IOError:
         logger.info('Pre-processed data not found in {}'.format(pickle_name))
@@ -101,22 +101,21 @@ def sha(s):
     # # -- train
     # # design a Keras NN with three RNN streams (jets, photons, muons)
-    # # -- train
-    # # design a Keras NN with three RNN streams (jets, photons, muons)
-
+    le=data['LabelEncoder']
     # # combine the outputs and process them through a bunch of FF layers
     # # use a validation split of 20%
     # # save out the weights to hdf5 and the model to yaml
-    net=NN_train(data)
+    net=NN_train(data, model_name)
 
     # # -- test
     # # evaluate performance on the test set
     yhat=NN_test(net, data)
 
     # # -- plot performance
-    plot_NN(yhat, data)
+    #plot_NN(yhat, data)
 
     # # produce ROC curves to evaluate performance
+    plot_roc_Curve(yhat, data, le, model_name)
     # # save them out to pdf
     # plot_performance(yhat, data['y_test'], data['w_test'])
@@ -130,8 +129,9 @@
     # -- read in arguments
     parser = argparse.ArgumentParser()
     parser.add_argument('config', help="path to JSON file that specifies classes and corresponding ROOT files' paths")
+    parser.add_argument('model_name', help="name used to tag the saved network weights and the ROC outputs")
     parser.add_argument('--tree', help="name of the tree to open in the ntuples", default='mini')
     args = parser.parse_args()
 
     # -- pass arguments to main
-    sys.exit(main(args.config, args.tree))
+    sys.exit(main(args.config, args.model_name, args.tree))
diff --git a/plotting.py b/plotting.py
index c03f3d4..c6ff1de 100644
--- a/plotting.py
+++ b/plotting.py
@@ -5,6 +5,8 @@
 import pandautils as pup
 import os
 from sklearn.preprocessing import LabelEncoder
+from viz import calculate_roc, ROC_plotter, add_curve
+import cPickle
 
 def _plot_X(train, test, y_train, y_test, w_train, w_test, varlist, le, feature):
     '''
@@ -120,19 +122,54 @@ def plot_NN(yhat, data):
     matplotlib.rcParams.update({'font.size': 16})
     fig = plt.figure(figsize=(11.69, 8.27), dpi=100)
     bins = np.linspace(0,1,30)
+    #find probability of each class
     for k in range(len(np.unique(y_test))):
-        color = iter(cm.rainbow(np.linspace(0, 1, len(np.unique(y_test)))))
-        for j in range (len(np.unique(y_test))):
-            c = next(color)
-            _ = plt.hist(yhat[:,k][y_test==j],
-                bins=bins,
-                histtype='step',
-                normed=True,
-                label='Y=' + str(j),
-                weights=w_test[y_test == j],
-                color=c,
-                linewidth=1)
-        plt.xlabel('Probabilty of Y=' +str(k))
-        plt.ylabel('Weighted Normalized Number of Events')
-        plt.legend()
-        plt.show()
+        print k
+        color = iter(cm.rainbow(np.linspace(0, 1, len(np.unique(y_test)))))
+        #find the truth label for each class
+        for j in range (len(np.unique(y_test))):
+            c = next(color)
+            _ = plt.hist(yhat[:,k][y_test==j],
+                bins=bins,
+                histtype='step',
+                normed=True,
+                label='Y=' + str(j),
+                weights=w_test[y_test == j],
+                color=c,
+                linewidth=1)
+        plt.xlabel('Probability of Y=' + str(k))
+        plt.ylabel('Weighted Normalized Number of Events')
+        plt.legend()
+        plt.savefig('/Users/gigifstark/CERN_Work/HH2YBB')
+
+def plot_roc_Curve(yhat, data, le, model_name):
+    '''
+    Args:
+        yhat: an ndarray of the probability of each event for each class
+        data: dictionary containing relevant data
+    Returns:
+        plot: matplotlib ROC figure for each signal class (mass point) compared to the background
+        pickle file: dictionary with each ROC curve, saved to a pkl file
+    '''
+    y_test=data['y_test']
+    w_test=data['w_test']
+    pkl_dict={}
+    for k in range(0, len(np.unique(y_test))-1):
+        sig_back= (y_test==k)|(y_test==5) # keep only signal class k and the background class (5)
+        y=np.log(yhat[sig_back][:,k]/yhat[sig_back][:,5]) # log-likelihood-ratio discriminant
+        finite= np.isfinite(y)
+        curves_dictionary=add_curve ("Y="+str(k), 'blue',
+            calculate_roc(
+                y_test[sig_back][finite],
+                np.log(yhat[sig_back][finite][:,k]/yhat[sig_back][finite][:,5]),
+                pos_label=k,
+                weights=w_test[sig_back][finite]
+            )
+        )
+        pkl_dict.update(curves_dictionary)
+        print 'Plotting'
+        fig=ROC_plotter(curves_dictionary, model_name, title=le.inverse_transform(k), min_eff = 0.1, max_eff=1.0, logscale=True)
+        plt.ylim([0,100])
+        fig.savefig('/Users/gigifstark/CERN_Work/HH2YBB/roc'+ str(k)+'.pdf')
+    cPickle.dump(pkl_dict, open(model_name+"_pkl", 'wb'))
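
Note on the ROC construction in plot_roc_Curve: each signal class k is compared against the background class (hard-coded as index 5, the last label) with the log-likelihood ratio log(p_k / p_5), computed only for events whose true label is k or 5 and whose two probabilities are both non-zero. The sketch below is illustrative only: the helper name is arbitrary, it uses sklearn.metrics.roc_curve in place of the calculate_roc/add_curve/ROC_plotter helpers from the viz package, and it assumes (as the logscale=True and 0-100 y-range in the patch suggest) that curves are reported as signal efficiency versus background rejection (1 / false positive rate).

    import numpy as np
    from sklearn.metrics import roc_curve

    def roc_vs_background(yhat, y_test, w_test, k, background=5):
        # keep only events whose true label is the signal class k or the background class
        sig_back = (y_test == k) | (y_test == background)
        # log-likelihood-ratio discriminant between class k and the background
        llr = np.log(yhat[sig_back][:, k] / yhat[sig_back][:, background])
        # drop events where either class probability is zero (llr is +/-inf or nan)
        finite = np.isfinite(llr)
        fpr, tpr, _ = roc_curve(
            y_test[sig_back][finite],
            llr[finite],
            pos_label=k,
            sample_weight=w_test[sig_back][finite]
        )
        # report signal efficiency vs. background rejection (1 / false positive rate)
        nonzero = fpr > 0
        return tpr[nonzero], 1. / fpr[nonzero]

    # e.g. eff, rej = roc_vs_background(yhat, data['y_test'], data['w_test'], k=0)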