From a1515a84e0200db4362b0fd5cbd729bbf6adc51f Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-22-101.eu-central-1.compute.internal>
Date: Tue, 19 Jul 2016 16:04:11 +0000
Subject: [PATCH] add regression capability

---
 data_processing.py |  25 ++++++---
 nn_with_modes.py   | 126 +++++++++++++++++++++++++++++++++++++++++++++
 pipeline.py        |  27 +++++++---
 plotting.py        |  62 ++++++++++++++++++++--
 utils.py           |   5 ++
 5 files changed, 226 insertions(+), 19 deletions(-)
 create mode 100644 nn_with_modes.py

diff --git a/data_processing.py b/data_processing.py
index 8275b03..f5a348f 100644
--- a/data_processing.py
+++ b/data_processing.py
@@ -17,7 +17,7 @@ def _pairwise(iterable):
     return izip(a, a)
 
 
-def read_in(class_files_dict, tree_name, particles):
+def read_in(class_files_dict, tree_name, particles, mode):
     '''
     takes in dict mapping class names to list of root files, loads them and slices them into ML format
     Args:
@@ -74,22 +74,33 @@ def read_in(class_files_dict, tree_name, particles):
         le: LabelEncoder to transform numerical y back to its string values
     '''
     
-    #convert files to pd data frames, assign key to y, concat all files
+    #convert files to pd data frames, assign key or mass to y, concat all files
+
     def _make_df(val, key):
         df = pup.root2panda(val, tree_name)
-        df['y'] = key
+        if mode == 'classification':
+            df['y'] = key
+        elif mode == 'regression':
+            try:
+                df['y'] = int(key[1:])
+            except ValueError:
+                df['y'] = 0
         return df
 
     all_events = pd.concat([_make_df(val, key) for key, val in class_files_dict.iteritems()], ignore_index=True)
-    
+
     X = OrderedDict()
     for particle_name, particle_info in particles.iteritems():
         logger.info('Building X_{}'.format(particle_name))
         X[particle_name] = all_events[particle_info["branches"]].values
 
-    #transform string labels to integer classes
-    le = LabelEncoder()
-    y = le.fit_transform(all_events['y'].values)
+    #transform string labels to integer classes for classification or set y for regression
+    if mode == 'classification':
+        le = LabelEncoder()
+        y = le.fit_transform(all_events['y'].values)
+    elif mode == 'regression':
+        le = None
+        y = all_events['y'].values
     
     w = all_events['yybb_weight'].values
     
diff --git a/nn_with_modes.py b/nn_with_modes.py
new file mode 100644
index 0000000..667098b
--- /dev/null
+++ b/nn_with_modes.py
@@ -0,0 +1,126 @@
+from keras.models import Sequential
+from keras.layers.core import Activation, Dense, Dropout
+from keras.layers import Masking, GRU, Merge, Input, merge
+from keras.callbacks import EarlyStopping, ModelCheckpoint
+import numpy as np
+import matplotlib
+import matplotlib.pyplot as plt
+
+def train(data, mode):
+	'''
+	Args:
+		data: an OrderedDict containing all X, y, w ndarrays for all particles (both train and test), e.g.:
+              data = {
+                "X_jet_train" : X_jet_train,
+                "X_jet_test" : X_jet_test,
+                "X_photon_train" : X_photon_train,
+                "X_photon_test" : X_photon_test,
+                "y_train" : y_train,
+                "y_test" : y_test,
+                "w_train" : w_train,
+                "w_test" : w_test
+              }
+        mode: a string specifying the type of task, either 'regression' or 'classification'
+    Returns:
+    	combine_rnn: a Sequential trained on data
+    '''
+
+	X_jet_train = data['X_jet_train']
+	X_photon_train = data['X_photon_train']
+	y_train = data['y_train']
+
+	jet_channel = Sequential()
+	photon_channel = Sequential()
+
+	JET_SHAPE = X_jet_train.shape[1:]
+	PHOTON_SHAPE = X_photon_train.shape[1:]
+
+	jet_channel.add(Masking(mask_value=-999, input_shape=JET_SHAPE, name='jet_masking'))
+	jet_channel.add(GRU(25, name='jet_gru'))
+	jet_channel.add(Dropout(0.3, name='jet_dropout'))
+
+	photon_channel.add(Masking(mask_value=-999, input_shape=PHOTON_SHAPE, name='photon_masking'))
+	photon_channel.add(GRU(10, name='photon_gru'))
+	photon_channel.add(Dropout(0.3, name='photon_dropout'))
+
+	combined_rnn = Sequential()
+	combined_rnn.add(Merge([jet_channel, photon_channel], mode='concat'))
+	combined_rnn.add(Dense(24, activation='relu'))
+	combined_rnn.add(Dropout(0.3))
+	combined_rnn.add(Dense(12, activation='relu'))
+	combined_rnn.add(Dropout(0.3))
+	if mode == 'classification':
+		combined_rnn.add(Dense(6, activation='softmax'))
+		combined_rnn.compile('adam', 'sparse_categorical_crossentropy')
+
+	elif mode == 'regression':
+		combined_rnn.add(Dense(1))
+		combined_rnn.compile('adam', 'mae')
+
+	print 'Training:'
+	try:
+		combined_rnn.fit([X_jet_train, X_photon_train], 
+    		y_train, batch_size=16, class_weight={
+                k : (float(len(y_train)) / float(len(np.unique(y_train)) * 
+                	(len(y_train[y_train == k])))) for k in np.unique(y_train)
+     		},
+        	callbacks = [
+            	EarlyStopping(verbose=True, patience=10, monitor='val_loss'),
+          		ModelCheckpoint('./models/combinedrnn-progress',
+          		monitor='val_loss', verbose=True, save_best_only=True)
+        	],
+    	nb_epoch=30, validation_split = 0.2) 
+
+	except KeyboardInterrupt:
+		print 'Training ended early.'
+
+	return combined_rnn
+
+def test(net, data):
+	'''
+	Args:
+		net: a Sequential instance trained on data
+		data: an OrderedDict containing all X, y, w ndarrays for all particles (both train and test), e.g.:
+              data = {
+                "X_jet_train" : X_jet_train,
+                "X_jet_test" : X_jet_test,
+                "X_photon_train" : X_photon_train,
+                "X_photon_test" : X_photon_test,
+                "y_train" : y_train,
+                "y_test" : y_test,
+                "w_train" : w_train,
+                "w_test" : w_test
+              }
+	Returns:
+		yhat_rnn: a numpy array containing the predicted values for each event
+				In the case of regression:
+				[[  28.82653809]
+ 				[ 332.62536621]
+ 				[ 343.72662354]
+ 				...,
+ 				[ 290.94213867]
+ 				[ 311.36965942]
+				[ 325.11975098]]
+
+ 				In the case of classification:
+				[[  2.98070186e-03   1.02684367e-03   6.20509265e-04   5.31344442e-04 
+				   4.20760407e-05   9.94798541e-01]
+ 				[  1.43380761e-01   2.02934369e-01   2.18192190e-01   2.09208429e-01
+    			   1.84640139e-01   4.16441038e-02]
+ 				[  1.91159040e-01   2.36048207e-01   2.16798335e-01   1.83185950e-01
+    			   1.12408176e-01   6.04002886e-02]
+				...,
+ 				[  8.16606451e-03   5.52139431e-02   1.69157043e-01   2.80651450e-01
+    			   3.87061536e-01   9.97499675e-02]
+ 				[  3.25843632e-01   2.48317569e-01   1.64540142e-01   1.18563063e-01
+    			   5.40928766e-02   8.86427015e-02]
+ 				[  3.07332397e-01   2.48623013e-01   1.71252742e-01   1.26610160e-01
+    			   6.08449057e-02   8.53367895e-02]]
+	'''
+	X_jet_test = data['X_jet_test']
+	X_photon_test = data['X_photon_test']
+	y_test= data ['y_test']
+
+	yhat_rnn = net.predict([X_jet_test, X_photon_test], verbose = True, batch_size = 512) 
+
+	return yhat_rnn
\ No newline at end of file
diff --git a/pipeline.py b/pipeline.py
index 58f4c54..93bad46 100644
--- a/pipeline.py
+++ b/pipeline.py
@@ -1,14 +1,14 @@
 import json
 from data_processing import read_in, shuffle_split_scale, padding
+import numpy as np
 import pandautils as pup
 import cPickle
-from plotting import plot_inputs
 import utils
 import logging
-#from plotting import plot_inputs, plot_performance
-from functional_nn import train, test
+from plotting import plot_inputs, plot_confusion, plot_regression#, plot_performance
+from nn_with_modes import train, test 
 
-def main(json_config, tree_name):
+def main(json_config, mode, tree_name):
     '''
     Args:
     -----
@@ -51,7 +51,7 @@ def sha(s):
         return m.hexdigest()[:5]
 
     #-- if the pickle exists, use it
-    pickle_name = 'processed_data_' + sha(config) + '.pkl'
+    pickle_name = 'processed_data_' + sha(config) + '_' + sha(mode) + '.pkl'
     try:
         logger.info('Attempting to read from {}'.format(pickle_name))
         data = cPickle.load(open(pickle_name, 'rb'))
@@ -61,7 +61,7 @@ def sha(s):
         logger.info('Pre-processed data not found in {}'.format(pickle_name))
         logger.info('Processing data')
         # -- transform ROOT files into standard ML format (ndarrays) 
-        X, y, w, le = read_in(class_files_dict, tree_name, particles_dict)
+        X, y, w, le = read_in(class_files_dict, tree_name, particles_dict, mode)
 
         # -- shuffle, split samples into train and test set, scale features
         data = shuffle_split_scale(X, y, w) 
@@ -101,11 +101,18 @@ def sha(s):
     # # combine the outputs and process them through a bunch of FF layers
     # # use a validation split of 20%
     # # save out the weights to hdf5 and the model to yaml
-    net = train(data)
+    net = train(data, mode)
 
     # # -- test
     # # evaluate performance on the test set
     yhat = test(net, data)
+    
+    print yhat
+    # # -- plot performance by mode
+    if mode == 'regression':
+        plot_regression(yhat, data)
+    if mode == 'classification':
+        plot_confusion(yhat, data)
 
     # # -- plot performance
     # # produce ROC curves to evaluate performance
@@ -122,8 +129,12 @@ def sha(s):
     # -- read in arguments
     parser = argparse.ArgumentParser()
     parser.add_argument('config', help="path to JSON file that specifies classes and corresponding ROOT files' paths")
+    parser.add_argument('mode', help="classification or regression")
     parser.add_argument('--tree', help="name of the tree to open in the ntuples", default='mini')
     args = parser.parse_args()
 
+    if args.mode != 'classification' and args.mode != 'regression':
+        raise ValueError('Mode must be classification or regression')
+
     # -- pass arguments to main
-    sys.exit(main(args.config, args.tree))
+    sys.exit(main(args.config, args.mode, args.tree))
\ No newline at end of file
diff --git a/plotting.py b/plotting.py
index 3362ab6..a5ce87d 100644
--- a/plotting.py
+++ b/plotting.py
@@ -50,13 +50,20 @@ def _plot_X(train, test, y_train, y_test, w_train, w_test, le, particle, particl
 		color = iter(cm.rainbow(np.linspace(0, 1, len(np.unique(y_train)))))
 
 		# -- loop through the classes
-		for k in range(len(np.unique(y_train))):
+		for k in np.unique(y_train):
 			c = next(color)
+
+			# -- in regression, le is None and we want to keep the original key
+			try:
+				transformed_k=le.inverse_transform(k)
+			except AttributeError:
+				transformed_k=k
+
 			_ = plt.hist(flat_train[y_train == k], 
 				bins=bins, 
 				histtype='step', 
 				normed=True, 
-				label='Train - ' + le.inverse_transform(k),
+				label='Train - ' + str(transformed_k),
 				weights=w_train[y_train == k],
 				color=c, 
 				linewidth=1)
@@ -64,7 +71,7 @@ def _plot_X(train, test, y_train, y_test, w_train, w_test, le, particle, particl
 				bins=bins, 
 				histtype='step', 
 				normed=True,
-				label='Test  - ' + le.inverse_transform(k),
+				label='Test  - ' + str(transformed_k),
 				weights=w_test[y_test == k], 
 				color=c,
 				linewidth=2, 
@@ -151,4 +158,51 @@ def _plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):
 	# in each class)
 	cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
 	_plot_confusion_matrix(cm_normalized, title='Normalized confusion matrix')
-	plt.savefig('confusion.pdf')
\ No newline at end of file
+	plt.savefig('confusion2.pdf')
+
+
+def plot_regression(yhat, data):
+	'''
+	Args:
+		yhat: numpy array of dim [n_ev, n_classes] with the net predictions on the test data 
+		data: an OrderedDict containing all X, y, w ndarrays for all particles (both train and test), e.g.:
+              data = {
+                "X_jet_train" : X_jet_train,
+                "X_jet_test" : X_jet_test,
+                "X_photon_train" : X_photon_train,
+                "X_photon_test" : X_photon_test,
+                "y_train" : y_train,
+                "y_test" : y_test,
+                "w_train" : w_train,
+                "w_test" : w_test
+              }
+	Saves:
+		'regression_test.pdf': a histogram plotting yhat containing the predicted masses
+	'''
+	
+	y_test = data['y_test'].values
+	w_test = data['w_test']
+
+	color = iter(cm.rainbow(np.linspace(0, 1, len(np.unique(y_test)))))
+	matplotlib.rcParams.update({'font.size': 16})
+	fig = plt.figure(figsize=(11.69, 8.27), dpi=100)
+
+	bins = np.linspace(
+		min(min(yhat), min(y_test)), 
+		max(max(yhat), max(y_test)), 
+		30)
+
+	for k in np.unique(y_test):
+		c = next(color)
+		_ = plt.hist(yhat[y_test == k], 
+			bins=bins, 
+			histtype='step', 
+			normed=True, 
+			label=str(k),
+			weights=w_test[y_test == k],
+			color=c, 
+			linewidth=1)
+
+	plt.ylabel('Weighted Events')
+	plt.legend(prop={'size': 10}, fancybox=True, framealpha=0.5)
+	plt.savefig('regression_test.pdf')
\ No newline at end of file
diff --git a/utils.py b/utils.py
index e62f570..505b4d1 100644
--- a/utils.py
+++ b/utils.py
@@ -27,4 +27,9 @@ def load_config(config_file):
 			if k not in particle_info.keys():
 				raise KeyError('Particle configuration requires key: {}'.format(k))
 
+	for class_name in config['classes'].keys():
+		if not class_name.startswith('X'):
+			if class_name != 'bkg':
+				raise KeyError('Class name must start with X if it is not bkg')
+
 	return config
\ No newline at end of file