Commit: for trial 2
Gigi Stark authored and Gigi Stark committed Jul 13, 2016
2 parents 769601f + 29170ae commit fbedd78
Showing 10 changed files with 402 additions and 78 deletions.
68 changes: 68 additions & 0 deletions .gitignore
@@ -0,0 +1,68 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# ROOT files
*.root*

# OSX files
.DS_Store

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints
32 changes: 12 additions & 20 deletions README.md
@@ -1,30 +1,22 @@
# hh2yybb Event Classifier
Event level classifier for the hh-->yybb analysis using multi-stream RNNs

## Project Title: Develop a ML algorithm to distinguish SM hh (gamma-gamma bb) events from background
## RNN-based Event Level Classifier for ATLAS Analysis (hh-->yybb)

### Purpose:
ATLAS has a ‘cut based’ hh data analysis. To improve on this, an ML algorithm is being trained to select the correct “second b jet” in single-tagged events ([bbyy jet classifier](https://github.com/jemrobinson/bbyy_jet_classifier)). The project proposed here will take this a step further and develop an algorithm for all events passing some minimum selection criteria, not just single-tagged events.
'Cut-based' methods are the prevalent data-analysis strategy in ATLAS, and they are currently used in the hh-->yybb analysis group as well. To improve on this, an ML algorithm is being trained to select the correct “second b jet” in single-tagged events ([bbyy jet classifier](https://github.com/jemrobinson/bbyy_jet_classifier)). The project in this repo will take this a step further and develop an event classifier.

### Current Status:
![Pipeline](images/UGsWorkflow.png)

### Deep Learning Module:
![Net](images/MultiStreamRNN.png)
Our goal is to train an event-level classifier that can distinguish di-Higgs resonant decays from photons+jets backgrounds. To describe an event, we combine the pre-calculated event-level information with a series of RNN streams that extract information about the event from the properties of the different types of particles in it (jets, electrons, photons, muons, ...).

We phrase our project both as a classification and a regression. In the classification task, we consider every mass hypothesis for the di-Higgs parent particle as a separate class, and we attempt to solve a multi-class classification problem. In the regression task, we use the nominal mass of the parent particle as the continuous variable to predict.
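As an illustration of the two framings, here is a minimal sketch (not from this repo; the `mass_hypothesis` array is hypothetical) of how the targets could be encoded:

```python
import numpy as np

# hypothetical per-event array of generated parent-particle masses, in GeV
mass_hypothesis = np.array([300., 400., 400., 500., 300.])

# classification: every mass hypothesis becomes an integer class label
classes, y_class = np.unique(mass_hypothesis, return_inverse=True)  # y_class in {0, 1, 2}

# regression: the nominal mass itself is the continuous target
y_reg = mass_hypothesis
```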

### To-do list:

* Produce the ntuples using `HGamAnalysisFramework`: <br>
Decide what info to include (jets and photons, but also leptons, pileup info?) <br>
Apply the pre-selection <br>
Assign truth labels using the b-quark-from-Higgs labeling scheme <br>
Actually make the ntuples on the grid – run on signal and bkg events

* Analysis Coding Tasks -- Modules needed:
1. Read-in module that knows about the ntuple format
2. Data processing module that uses `scikit-learn` to: <br>
scale all variables to have mean zero and sd of 1 <br>
shuffle events <br>
split data into training and testing samples (a sketch follows this list)
3. Plotting module to check all variables before training, both scaled and pre-scaled, to make sure things look reasonable and there are no bugs
4. Training module that uses `Keras` (design the RNN, test different NN architectures, etc.)
5. Testing module to check performance and produce ROC curves. Plot the ROC curve as a function of mu (pile-up), pt of the largest jet, Njets, etc.

* Write presentations
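For item 2 in the coding-tasks list above, a minimal sketch of the `scikit-learn` processing chain (illustrative only; the function and variable names here are not the repo's):

```python
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split  # sklearn.model_selection in newer releases

def process(X, y):
    # shuffle and split the events into training and testing samples
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    # scale all variables to mean 0 and sd 1, using training-set statistics only
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test, y_train, y_test
```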

---
This project has been assigned to [@gstark12](https://github.com/gstark12) and [@jennyailin](https://github.com/jennyailin) as part of their Summer 2016 internship at CERN. They will work under the supervision of [@mickypaganini](https://github.com/mickypaganini) and Prof. Paul Tipton.
13 changes: 13 additions & 0 deletions config_hh.json
@@ -62,6 +62,19 @@
"photon_topoEtcone40"
],
"max_length" : 3
},

"event":
{
"branches" :
[
"jet_n",
"met",
"met_phi",
"met_sumet",
"photon_n"
],
"max_length" : 1
}
}
}
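The new `event` block mirrors the per-particle blocks: a list of branches plus a `max_length` (here 1, since event-level info is a single vector per event). A hedged sketch of reading it back (the top-level layout of the JSON is not shown in this diff, so the access path below is an assumption):

```python
import json

with open('config_hh.json') as f:
    config = json.load(f)

# assumed access path -- adjust to the file's actual top-level layout
event_branches = config['event']['branches']      # ["jet_n", "met", "met_phi", "met_sumet", "photon_n"]
event_max_length = config['event']['max_length']  # 1
```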
137 changes: 137 additions & 0 deletions functional_nn.py
@@ -0,0 +1,137 @@
from keras.layers import Masking, GRU, Input, merge, Activation, Dense, Dropout
from keras.models import Model
from keras.callbacks import EarlyStopping, ModelCheckpoint
import logging
import numpy as np
import deepdish.io as io
import os

def train(data):
'''
Args:
-----
data: an OrderedDict containing all X, y, w ndarrays for all particles
(both train and test), in the format documented in `test` below
Returns:
--------
net: the trained keras Model instance
'''
logger = logging.getLogger('Train')

# -- extract training matrices from `data`
X_jets_train = data['X_jet_train']
X_photons_train = data['X_photon_train']
X_event_train = data['X_event_train']
y_train = data['y_train']

# -- class weight needed if we want equal class representation
#class_weight = {k : (1./len(np.unique(y_train))) / (float(len(y_train[y_train == k])) / float(len(y_train))) for k in np.unique(y_train)}
class_weight = {k : (float(len(y_train)) / float(len(np.unique(y_train)) * (len(y_train[y_train == k])))) for k in np.unique(y_train)}
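# ^ weight for class k = N_total / (n_classes * N_k): inverse-frequency
# ("balanced") weighting, so under-represented mass hypotheses contribute
# to the loss as much as common ones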


# -- placeholders for matrix shapes
JET_SHAPE = X_jets_train.shape[1:]
PHOTON_SHAPE = X_photons_train.shape[1:]
EVENT_SHAPE = (X_event_train.shape[1], )

# -- input layers (3 streams)
jet_inputs = Input(shape=JET_SHAPE)
photon_inputs = Input(shape=PHOTON_SHAPE)
event_inputs = Input(shape=EVENT_SHAPE)

# -- jet RNN
jet_outputs = GRU(25, name='jet_gru')(
Masking(mask_value=-999, name='jet_masking')(
jet_inputs
)
)

# -- photon RNN
photon_outputs = GRU(10, name='photon_gru')(
Masking(mask_value=-999, name='photon_masking')(
photon_inputs
)
)

# -- merge (jet, photon) RNNs
rnn_outputs = merge([jet_outputs, photon_outputs], mode='concat')
rnn_outputs = Dense(16, activation='relu')(
#Dropout(0.3)(
#Dense(32, activation='relu')(
#Dropout(0.3)(
rnn_outputs
#)
#)
#)
)

# -- merge event level info as well
merged_streams = merge([rnn_outputs, event_inputs], mode='concat')

# -- final feed-forward processing of info up to output layer with softmax
yhat = Dense(len(np.unique(y_train)), activation='softmax')(
Dropout(0.2)(
#Dense(8, activation='relu')(
#Dropout(0.2)(
Dense(8, activation='relu')(
Dropout(0.2)(
merged_streams
)
)
#)
#)
)
)

# -- build Model from symbolic structure above
net = Model(input=[jet_inputs, photon_inputs, event_inputs], output=yhat)
net.summary()
net.compile('adam', 'sparse_categorical_crossentropy', metrics=['accuracy'])
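# ^ 'sparse_categorical_crossentropy' takes integer class labels directly,
# so y_train does not need to be one-hot encoded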

try:
weights_path = os.path.join('weights', 'functional_combined_rnn-progress.h5')
logger.info('Trying to load weights from ' + weights_path)
net.load_weights(weights_path)
logger.info('Weights found and loaded from ' + weights_path)
except IOError:
logger.info('Pre-trained weights not found in ' + weights_path)

logger.info('Training the net')
# -- train!
try:
net.fit([X_jets_train, X_photons_train, X_event_train],
y_train, batch_size=16,
class_weight=class_weight,
callbacks = [
EarlyStopping(verbose=True, patience=20, monitor='val_loss'),
ModelCheckpoint(weights_path,
monitor='val_loss', verbose=True, save_best_only=True)
],
nb_epoch=100, validation_split=0.2)

except KeyboardInterrupt:
print 'Training ended early.'

# -- load best weights back into the net
net.load_weights(weights_path)
return net

def test(net, data):
'''
Args:
-----
net: a trained keras Model instance
data: an OrderedDict containing all X, y, w ndarrays for all particles (both train and test), e.g.:
data = {
"X_jet_train" : X_jet_train,
"X_jet_test" : X_jet_test,
"X_photon_train" : X_photon_train,
"X_photon_test" : X_photon_test,
"y_train" : y_train,
"y_test" : y_test,
"w_train" : w_train,
"w_test" : w_test
}
Returns:
--------
yhat: numpy array of dim [n_ev, n_classes] with the net predictions on the test data
'''
yhat = net.predict([data['X_jet_test'], data['X_photon_test'], data['X_event_test']],
verbose = True, batch_size = 512)
io.save(os.path.join('output', 'yhat.h5'), yhat) # deepdish.io.save expects a path, not an open file handle
# -- example of other quantities that can be evaluated from yhat
#class_predictions = [np.argmax(ev) for ev in yhat]
return yhat
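# -- example usage (the actual driver lives in pipeline.py):
# net = train(data)
# yhat = test(net, data)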
Binary file added images/MultiStreamRNN.png
Binary file added images/UGsWorkflow.png
Binary file added images/UGsWorkflow_v1.png
66 changes: 66 additions & 0 deletions nn.py
@@ -0,0 +1,66 @@
from keras.models import Sequential
from keras.layers.core import Activation, Dense, Dropout, Lambda
from keras.layers import Masking, GRU, Merge, Input, merge
from keras.callbacks import EarlyStopping, ModelCheckpoint
import logging
import numpy as np

def train(data):
'''
Sequential/Merge version of the three-stream net. Takes the same `data`
OrderedDict used by functional_nn.train and returns the trained model.
'''
# -- extract training matrices from `data`
X_jets_train = data['X_jet_train']
X_photons_train = data['X_photon_train']
X_event_train = data['X_event_train']
y_train = data['y_train']

# -- build net
jet_channel = Sequential()
photon_channel = Sequential()
event_level = Sequential()

JET_SHAPE = X_jets_train.shape[1:]
PHOTON_SHAPE = X_photons_train.shape[1:]
EVENT_SHAPE = X_event_train.shape[1]

jet_channel.add(Masking(mask_value=-999, input_shape=JET_SHAPE, name='jet_masking'))
jet_channel.add(GRU(25, name='jet_gru'))
jet_channel.add(Dropout(0.3, name='jet_dropout'))

photon_channel.add(Masking(mask_value=-999, input_shape=PHOTON_SHAPE, name='photon_masking'))
photon_channel.add(GRU(10, name='photon_gru'))
photon_channel.add(Dropout(0.3, name='photon_dropout'))

# -- identity Lambda: a pass-through channel so the flat event-level features
# can be concatenated with the two RNN outputs in the Merge below
event_level.add(Lambda(lambda x: x, input_shape=(EVENT_SHAPE, )))

combined_rnn = Sequential()
combined_rnn.add(Merge([jet_channel, photon_channel, event_level], mode='concat'))
combined_rnn.add(Dense(32, activation='relu'))
# combined_rnn.add(Dropout(0.3))
# combined_rnn.add(Dense(32, activation='relu'))
# combined_rnn.add(Dropout(0.3))
# combined_rnn.add(Dense(16, activation='relu'))
combined_rnn.add(Dropout(0.3))
combined_rnn.add(Dense(8, activation='relu'))
combined_rnn.add(Dropout(0.3))
combined_rnn.add(Dense(len(np.unique(y_train)), activation='softmax'))

combined_rnn.compile('adam', 'sparse_categorical_crossentropy')

logger = logging.getLogger('Train')
logger.info('Training the net')
try:
combined_rnn.fit([X_jets_train, X_photons_train, X_event_train],
y_train, batch_size=16,
callbacks = [
EarlyStopping(verbose=True, patience=10, monitor='val_loss'),
ModelCheckpoint('./combinedrnn-progress',
monitor='val_loss', verbose=True, save_best_only=True)
],
nb_epoch=30, validation_split = 0.2)

except KeyboardInterrupt:
print 'Training ended early.'

return combined_rnn
7 changes: 5 additions & 2 deletions pipeline.py
@@ -11,6 +11,8 @@
#from plotting import plot_inputs, plot_performance

#from nn_model import train, test
#from nn_model import train, test
#from functional_nn import train, test

def main(json_config, model_name, tree_name):
'''
@@ -85,11 +87,11 @@ def sha(s):
Plots should be saved out as a pdf with informative names
'''
logger.info('Saving input distributions in ./plots/')
plot_inputs(data, particles_dict.keys())
plot_inputs(data, particles_dict)

logger.info('Padding')
for key in data:
if key.startswith('X_'):
if ((key.startswith('X_')) and ('event' not in key)): # no padding for `event` matrix
data[key] = padding(data[key], particles_dict[key.split('_')[1]]['max_length'])
# ^ assuming naming convention: X_<particle>_train, X_<particle>_test
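# -- a hedged sketch of what `padding` might do (the actual helper is defined
# elsewhere in the repo and may differ): pad or truncate each event's
# variable-length particle array to `max_length` rows of -999, matching the
# Masking(mask_value=-999) layers in the nets:
# def padding(X, max_length, value=-999):
#     out = value * np.ones((len(X), max_length, X[0].shape[-1]))
#     for i, x in enumerate(X):
#         out[i, :min(len(x), max_length)] = x[:max_length]
#     return out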

@@ -110,6 +112,7 @@ def sha(s):
# # -- test
# # evaluate performance on the test set
yhat = NN_test(net, data)

# # -- plot performance
#plot_NN(yhat, data)