diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..1f5127d
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,68 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# ROOT files
+*.root*
+
+# OSX files
+.DS_Store
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Ipython Notebook
+.ipynb_checkpoints
\ No newline at end of file
diff --git a/README.md b/README.md
index 8582fb6..014477e 100644
--- a/README.md
+++ b/README.md
@@ -1,30 +1,22 @@
 # hh2yybb Event Classifier
 Event level classifier for the hh-->yybb analysis using multi-stream RNNs
-## Project Title: Develop a ML algorithm to distinguish SM hh (gamma-gamma bb) events from background
+## RNN-based Event Level Classifier for ATLAS Analysis (hh-->yybb)
 ### Purpose:
-ATLAS has a ‘cut based’ hh data analysis. To improve on this, a ML algoorithm is being trained to select the correct “second b jet” in single-tagged events ([bbyy jet classifier](https://github.com/jemrobinson/bbyy_jet_classifier)). The project proposed here will take this a step further and develop an algorithm for all events passing some minimum selection criteria, not just single-tagged events.
+‘Cut-based’ methods are the prevalent data analysis strategy in ATLAS, and they are also the strategy currently used in the hh-->yybb analysis group. To improve on this, a ML algorithm is being trained to select the correct “second b jet” in single-tagged events ([bbyy jet classifier](https://github.com/jemrobinson/bbyy_jet_classifier)). The project in this repo takes this a step further and develops an event-level classifier.
+### Current Status:
+![Pipeline](images/UGsWorkflow.png)
+
+### Deep Learning Module:
+![Net](images/MultiStreamRNN.png)
+Our goal is to train an event-level classifier that can distinguish resonant di-Higgs decays from photons+jets backgrounds. To describe an event, we combine the pre-calculated event-level information with a series of RNN streams that extract information about the event from the properties of the different types of particles in it (jets, electrons, photons, muons, ...).
+
+We phrase our project both as a classification and as a regression task. In the classification task, we consider every mass hypothesis for the di-Higgs parent particle as a separate class, and we attempt to solve a multi-class classification problem. In the regression task, we use the nominal mass of the parent particle as the continuous variable to predict.
+
 ### To-do list:
-
-* Produce the ntuples using `HGamAnalysisFramework`:
- Decide what info to include (jets and photons, but also leptons, pileup info?)
- Apply the pre-selection
- Assign truth labels using b-quark-from-Higgs labeling scheme
- Actually make the ntuples on grid – run on signal and bkg events
-
-* Analysis Coding Tasks -- Modules needed:
- 1. Read-in module that knows about the ntuple format
- 2. Data processing module that uses `scikit-learn` to:
- scale all variables to have mean zero and sd of 1
- shuffle events
- split data into training and testing samples
- 3. Plotting module to check all variables before training, both scaled and pre-scaled variables to make sure things look reasonable and there are no bugs
- 4. Training module that uses `Keras` (design RNN, test different NN architectures, etc.)
- 5. Testing module to check performance and produce ROC curves. Plot ROC curve as a function of mu (pile-up), pt of largest jet, Njets, etc.
-
-* Write presentations
+ 1. Testing module to check performance and produce ROC curves. Plot ROC curve as a function of mu (pile-up), pt of largest jet, Njets, etc.
 ---
 This project has been assigned to [@gstark12](https://github.com/gstark12) and [@jennyailin](https://github.com/jennyailin) as part of their Summer 2016 internship at CERN. They will work under the supervision of [@mickypaganini](https://github.com/mickypaganini) and Prof. Paul Tipton.
diff --git a/config_hh.json b/config_hh.json
index e0783c3..1b5ca14 100644
--- a/config_hh.json
+++ b/config_hh.json
@@ -62,6 +62,19 @@
             "photon_topoEtcone40"
         ],
         "max_length" : 3
+    },
+
+    "event":
+    {
+        "branches" :
+        [
+            "jet_n",
+            "met",
+            "met_phi",
+            "met_sumet",
+            "photon_n"
+        ],
+        "max_length" : 1
     }
   }
 }
\ No newline at end of file
diff --git a/functional_nn.py b/functional_nn.py
new file mode 100644
index 0000000..97d738a
--- /dev/null
+++ b/functional_nn.py
@@ -0,0 +1,137 @@
+from keras.layers import Masking, GRU, Input, merge, Activation, Dense, Dropout
+from keras.models import Model
+from keras.callbacks import EarlyStopping, ModelCheckpoint
+import logging
+import numpy as np
+import deepdish.io as io
+import os
+
+def train(data):
+    '''Train the multi-stream (jet, photon, event) functional model on `data`;
+    return the net with the best weights loaded.'''
+    logger = logging.getLogger('Train')
+
+    # -- extract training matrices from `data`
+    X_jets_train = data['X_jet_train']
+    X_photons_train = data['X_photon_train']
+    X_event_train = data['X_event_train']
+    y_train = data['y_train']
+
+    # -- class weight needed if we want equal class representation
+    #class_weight = {k : (1./len(np.unique(y_train))) / (float(len(y_train[y_train == k])) / float(len(y_train))) for k in np.unique(y_train)}
+    class_weight = {k : (float(len(y_train)) / float(len(np.unique(y_train)) * (len(y_train[y_train == k])))) for k in np.unique(y_train)}
+
+
+    # -- placeholders for matrix shapes
+    JET_SHAPE = X_jets_train.shape[1:]
+    PHOTON_SHAPE = X_photons_train.shape[1:]
+    EVENT_SHAPE = (X_event_train.shape[1], )
+
+    # -- input layers (3 streams)
+    jet_inputs = Input(shape=JET_SHAPE)
+    photon_inputs = Input(shape=PHOTON_SHAPE)
+    event_inputs = Input(shape=EVENT_SHAPE)
+
+    # -- jet RNN
+    jet_outputs = GRU(25, name='jet_gru')(
+        Masking(mask_value=-999, name='jet_masking')(
+            jet_inputs
+        )
+    )
+
+    # -- photon RNN
+    photon_outputs = GRU(10, name='photon_gru')(
+        Masking(mask_value=-999, name='photon_masking')(
+            photon_inputs
+        )
+    )
+
+    # -- merge (jet, photon) RNNs
+    rnn_outputs = merge([jet_outputs, photon_outputs], mode='concat')
+    rnn_outputs = Dense(16, activation='relu')(
+        #Dropout(0.3)(
+        #Dense(32, activation='relu')(
+        #Dropout(0.3)(
+        rnn_outputs
+        #)
+        #)
+        #)
+    )
+
+    # -- merge event level info as well
+    merged_streams = merge([rnn_outputs, event_inputs], mode='concat')
+
+    # -- final feed-forward processing of info up to output layer with softmax
+    yhat = Dense(len(np.unique(y_train)), activation='softmax')(
+        Dropout(0.2)(
+            #Dense(8, activation='relu')(
+            #Dropout(0.2)(
+            Dense(8, activation='relu')(
+                Dropout(0.2)(
+                    merged_streams
+                )
+            )
+            #)
+            #)
+        )
+    )
+
+    # -- build Model from symbolic structure above
+    net = Model(input=[jet_inputs, photon_inputs, event_inputs], output=yhat)
+    net.summary()
+    net.compile('adam', 'sparse_categorical_crossentropy', metrics=['accuracy'])
+
+    try:
+        weights_path = os.path.join('weights', 'functional_combined_rnn-progress.h5')
+        logger.info('Trying to load weights from ' + weights_path)
+        net.load_weights(weights_path)
+        logger.info('Weights found and loaded from ' + weights_path)
+    except IOError:
+        logger.info('Pre-trained weights not found in ' + weights_path)
+
+    logger.info('Training the net')
+    # -- train!
+    try:
+        net.fit([X_jets_train, X_photons_train, X_event_train],
+            y_train, batch_size=16,
+            class_weight=class_weight,
+            callbacks = [
+                EarlyStopping(verbose=True, patience=20, monitor='val_loss'),
+                ModelCheckpoint(weights_path,
+                    monitor='val_loss', verbose=True, save_best_only=True)
+            ],
+            nb_epoch=100, validation_split=0.2)
+
+    except KeyboardInterrupt:
+        print 'Training ended early.'
+
+    # -- load best weights back into the net
+    net.load_weights(weights_path)
+    return net
+
+def test(net, data):
+    '''
+    Args:
+    -----
+        net: a trained keras Model instance
+        data: an OrderedDict containing all X, y, w ndarrays for all particles (both train and test), e.g.:
+              data = {
+                "X_jet_train" : X_jet_train,
+                "X_jet_test" : X_jet_test,
+                "X_photon_train" : X_photon_train,
+                "X_photon_test" : X_photon_test,
+                "y_train" : y_train,
+                "y_test" : y_test,
+                "w_train" : w_train,
+                "w_test" : w_test
+              }
+    Returns:
+    --------
+        yhat: numpy array of dim [n_ev, n_classes] with the net predictions on the test data
+    '''
+    yhat = net.predict([data['X_jet_test'], data['X_photon_test'], data['X_event_test']],
+        verbose = True, batch_size = 512)
+    io.save(os.path.join('output', 'yhat.h5'), yhat)
+    # -- example of other quantities that can be evaluated from yhat
+    #class_predictions = [np.argmax(ev) for ev in yhat]
+    return yhat
\ No newline at end of file
diff --git a/images/MultiStreamRNN.png b/images/MultiStreamRNN.png
new file mode 100644
index 0000000..a28b04e
Binary files /dev/null and b/images/MultiStreamRNN.png differ
diff --git a/images/UGsWorkflow.png b/images/UGsWorkflow.png
new file mode 100644
index 0000000..5cff6dc
Binary files /dev/null and b/images/UGsWorkflow.png differ
diff --git a/images/UGsWorkflow_v1.png b/images/UGsWorkflow_v1.png
new file mode 100644
index 0000000..e942e71
Binary files /dev/null and b/images/UGsWorkflow_v1.png differ
diff --git a/nn.py b/nn.py
new file mode 100644
index 0000000..ddccb25
--- /dev/null
+++ b/nn.py
@@ -0,0 +1,66 @@
+from keras.models import Sequential
+from keras.layers.core import Activation, Dense, Dropout, Lambda
+from keras.layers import Masking, GRU, Merge, Input, merge
+from keras.callbacks import EarlyStopping, ModelCheckpoint
+import logging
+import numpy as np
+
+def train(data):
+
+    # -- extract training matrices from `data`
+    X_jets_train = data['X_jet_train']
+    X_photons_train = data['X_photon_train']
+    X_event_train = data['X_event_train']
+    y_train = data['y_train']
+
+    # -- build net
+    jet_channel = Sequential()
+    photon_channel = Sequential()
+    event_level = Sequential()
+
+    JET_SHAPE = X_jets_train.shape[1:]
+    PHOTON_SHAPE = X_photons_train.shape[1:]
+    EVENT_SHAPE = X_event_train.shape[1]
+
+    jet_channel.add(Masking(mask_value=-999, input_shape=JET_SHAPE, name='jet_masking'))
+    jet_channel.add(GRU(25, name='jet_gru'))
+    jet_channel.add(Dropout(0.3, name='jet_dropout'))
+
+    photon_channel.add(Masking(mask_value=-999, input_shape=PHOTON_SHAPE, name='photon_masking'))
+    photon_channel.add(GRU(10, name='photon_gru'))
+    photon_channel.add(Dropout(0.3, name='photon_dropout'))
+
+    event_level.add(Lambda(lambda x: x, input_shape=(EVENT_SHAPE, )))
+
+    combined_rnn = Sequential()
+    combined_rnn.add(Merge([jet_channel, photon_channel, event_level], mode='concat'))
+    combined_rnn.add(Dense(32, activation='relu'))
+    # combined_rnn.add(Dropout(0.3))
+    # combined_rnn.add(Dense(32, activation='relu'))
+    # combined_rnn.add(Dropout(0.3))
+    # combined_rnn.add(Dense(16, activation='relu'))
+    combined_rnn.add(Dropout(0.3))
+    combined_rnn.add(Dense(8, activation='relu'))
+    combined_rnn.add(Dropout(0.3))
+    combined_rnn.add(Dense(len(np.unique(y_train)), activation='softmax'))
+
+    combined_rnn.compile('adam', 'sparse_categorical_crossentropy')
+
+    logger = logging.getLogger('Train')
+    logger.info('Compiling the net')
+    try:
+        combined_rnn.fit([X_jets_train, X_photons_train, X_event_train],
+            y_train, batch_size=16,
+            callbacks = [
+                EarlyStopping(verbose=True, patience=10, monitor='val_loss'),
+                ModelCheckpoint('./combinedrnn-progress',
+                    monitor='val_loss', verbose=True, save_best_only=True)
+            ],
+            nb_epoch=30, validation_split = 0.2)
+
+    except KeyboardInterrupt:
+        print 'Training ended early.'
+
+    return combined_rnn
\ No newline at end of file
diff --git a/pipeline.py b/pipeline.py
index 49bbdb0..b272634 100644
--- a/pipeline.py
+++ b/pipeline.py
@@ -11,6 +11,8 @@
 #from plotting import plot_inputs, plot_performance
 #from nn_model import train, test
+#from nn_model import train, test
+#from functional_nn import train, test
 
 def main(json_config, model_name, tree_name):
     '''
@@ -85,11 +87,11 @@ def sha(s):
     Plots should be saved out a pdf with informative names
     '''
     logger.info('Saving input distributions in ./plots/')
-    plot_inputs(data, particles_dict.keys())
+    plot_inputs(data, particles_dict)
 
     logger.info('Padding')
     for key in data:
-        if key.startswith('X_'):
+        if ((key.startswith('X_')) and ('event' not in key)): # no padding for `event` matrix
             data[key] = padding(data[key], particles_dict[key.split('_')[1]]['max_length'])
             # ^ assuming naming convention: X__train, X__test
@@ -110,6 +112,7 @@ def sha(s):
     # # -- test
     # # evaluate performance on the test set
     yhat=NN_test(net, data)
+
     # # -- plot performance
     #plot_NN(yhat, data)
diff --git a/plotting.py b/plotting.py
index 37a98a9..ed03fcd 100644
--- a/plotting.py
+++ b/plotting.py
@@ -7,8 +7,9 @@
 from sklearn.preprocessing import LabelEncoder
 from viz import calculate_roc, ROC_plotter, add_curve
 import cPickle
+from sklearn.metrics import confusion_matrix
 
-def _plot_X(train, test, y_train, y_test, w_train, w_test, varlist, le, feature):
+def _plot_X(train, test, y_train, y_test, w_train, w_test, le, particle, particles_dict):
     '''
     Args:
         train: ndarray [n_ev_train, n_muon_feat] containing the events allocated for training
@@ -19,65 +20,69 @@ def _plot_X(train, test, y_train, y_test, w_train, w_test, varlist, le, feature)
         w_test: ndarray [n_ev_test, 1] containing the shuffled EventWeights allocated for testing
         varlist: list of names of branches like 'jet_px', 'photon_E', 'muon_Iso'
         le: LabelEncoder to transform numerical y back to its string values
-        feature: a string like 'Jet', 'Muon', 'Photon'
+        particle: a string like 'jet', 'muon', 'photon', ...
+        particles_dict: dict of particle streams from the json config, mapping each stream name (e.g. 'jet', 'photon', 'event') to its 'branches' list and 'max_length'
     Returns:
         Saves .pdf histograms for each feature-related branch plotting the training and test sets for each class
     '''
     # -- extend w and y arrays to match the total number of particles per event
-    w_train_ext = np.array(pup.flatten([[w] * (len(train[i, 0])) for i, w in enumerate(w_train)]))
-    w_test_ext = np.array(pup.flatten([[w] * (len(test[i, 0])) for i, w in enumerate(w_test)]))
-    y_train_ext = np.array(pup.flatten([[y] * (len(train[i, 0])) for i, y in enumerate(y_train)]))
-    y_test_ext = np.array(pup.flatten([[y] * (len(test[i, 0])) for i, y in enumerate(y_test)]))
+    try:
+        w_train = np.array(pup.flatten([[w] * (len(train[i, 0])) for i, w in enumerate(w_train)]))
+        w_test = np.array(pup.flatten([[w] * (len(test[i, 0])) for i, w in enumerate(w_test)]))
+        y_train = np.array(pup.flatten([[y] * (len(train[i, 0])) for i, y in enumerate(y_train)]))
+        y_test = np.array(pup.flatten([[y] * (len(test[i, 0])) for i, y in enumerate(y_test)]))
+    except TypeError: # `event` has a different structure that does not require all this
+        pass
+
+    varlist = particles_dict[particle]['branches']
-    # -- we keep a column counter because `varlist` contains all variables for all particles,
-    # but each X matrix only contains as many columns as the number of variables
-    # related to that specific paritcle type
-    column_counter = 0
     # -- loop through the variables
-    for key in varlist:
-        if key.startswith(feature):
-            flat_train = pup.flatten(train[:, column_counter])
-            flat_test = pup.flatten(test[:, column_counter])
-            matplotlib.rcParams.update({'font.size': 16})
-            fig = plt.figure(figsize=(11.69, 8.27), dpi=100)
+    for column_counter, key in enumerate(varlist):
+
+        flat_train = pup.flatten(train[:, column_counter])
+        flat_test = pup.flatten(test[:, column_counter])
+
+        matplotlib.rcParams.update({'font.size': 16})
+        fig = plt.figure(figsize=(11.69, 8.27), dpi=100)
+
+        bins = np.linspace(
+            min(min(flat_train), min(flat_test)),
+            max(max(flat_train), max(flat_test)),
+            30)
+        color = iter(cm.rainbow(np.linspace(0, 1, len(np.unique(y_train)))))
+
+        # -- loop through the classes
+        for k in range(len(np.unique(y_train))):
+            c = next(color)
+            _ = plt.hist(flat_train[y_train == k],
+                bins=bins,
+                histtype='step',
+                normed=True,
+                label='Train - ' + le.inverse_transform(k),
+                weights=w_train[y_train == k],
+                color=c,
+                linewidth=1)
+            _ = plt.hist(flat_test[y_test == k],
+                bins=bins,
+                histtype='step',
+                normed=True,
+                label='Test - ' + le.inverse_transform(k),
+                weights=w_test[y_test == k],
+                color=c,
+                linewidth=2,
+                linestyle='dashed')
-            bins = np.linspace(
-                min(min(flat_train), min(flat_test)),
-                max(max(flat_train), max(flat_test)),
-                30)
-            color = iter(cm.rainbow(np.linspace(0, 1, len(np.unique(y_train)))))
-            # -- loop through the classes
-            for k in range(len(np.unique(y_train))):
-                c = next(color)
-                _ = plt.hist(flat_train[y_train_ext == k],
-                    bins=bins,
-                    histtype='step',
-                    normed=True,
-                    label='Train - ' + le.inverse_transform(k),
-                    weights=w_train_ext[y_train_ext == k],
-                    color=c,
-                    linewidth=1)
-                _ = plt.hist(flat_test[y_test_ext == k],
-                    bins=bins,
-                    histtype='step',
-                    normed=True,
-                    label='Test - ' + le.inverse_transform(k),
-                    weights=w_test_ext[y_test_ext == k],
-                    color=c,
-                    linewidth=2,
-                    linestyle='dashed')
-            plt.title(key)
-            plt.yscale('log')
-            plt.ylabel('Weighted Events')
-            plt.legend(prop={'size': 10}, fancybox=True, framealpha=0.5)
-            try:
-                plt.savefig(os.path.join('plots', key + '.pdf'))
-            except IOError:
-                os.makedirs('plots')
-                plt.savefig(os.path.join('plots', key + '.pdf'))
-            column_counter += 1
+        plt.title(key)
+        plt.yscale('log')
+        plt.ylabel('Weighted Events')
+        plt.legend(prop={'size': 10}, fancybox=True, framealpha=0.5)
+        try:
+            plt.savefig(os.path.join('plots', key + '.pdf'))
+        except IOError:
+            os.makedirs('plots')
+            plt.savefig(os.path.join('plots', key + '.pdf'))
-def plot_inputs(data, particle_names):
+def plot_inputs(data, particles_dict):
     '''
     Args:
         data: an OrderedDict containing all X, y, w ndarrays for all particles (both train and test), e.g.:
@@ -91,13 +96,14 @@
             "w_train" : w_train,
             "w_test" : w_test
           }
-        particle_names: list of strings, names of particle streams
+        #particle_names: list of strings, names of particle streams
+        particles_dict: dict of particle streams from the json config (same object passed to _plot_X)
     Returns:
         Saves .pdf histograms plotting the training and test sets of each class for each feature
     '''
-    for particle in particle_names:
+    for particle in particles_dict.keys():
         _plot_X(
             data['X_' + particle + '_train'],
             data['X_' + particle + '_test'],
@@ -105,10 +111,11 @@
             data['y_test'],
             data['w_train'],
             data['w_test'],
-            data['varlist'],
             data['LabelEncoder'],
-            particle
+            particle,
+            particles_dict
         )
+
 def plot_NN(yhat, data):
     '''
     Args:
@@ -173,3 +180,41 @@
         fig.savefig('/Users/gigifstark/CERN_Work/HH2YYBB/roc'+ str(k)+'.pdf')
     cPickle.dump(pkl_dict, open(model_name+"_pkl", 'wb'))
 
+def plot_confusion(yhat, data):
+    '''
+    Args:
+        yhat: numpy array of dim [n_ev, n_classes] with the net predictions on the test data
+        data: an OrderedDict containing all X, y, w ndarrays for all particles (both train and test), e.g.:
+          data = {
+            "X_jet_train" : X_jet_train,
+            "X_jet_test" : X_jet_test,
+            "X_photon_train" : X_photon_train,
+            "X_photon_test" : X_photon_test,
+            "y_train" : y_train,
+            "y_test" : y_test,
+            "w_train" : w_train,
+            "w_test" : w_test
+          }
+    Returns:
+        Saves the row-normalized confusion matrix to confusion.pdf
+    '''
+
+    y_test = data['y_test']
+
+    def _plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):
+        plt.imshow(cm, interpolation='nearest', cmap=cmap)
+        plt.title(title)
+        plt.colorbar()
+        tick_marks = np.arange(len(np.unique(y_test)))
+        plt.xticks(tick_marks, sorted(np.unique(y_test)))
+        plt.yticks(tick_marks, sorted(np.unique(y_test)))
+        plt.tight_layout()
+        plt.ylabel('True label')
+        plt.xlabel('Predicted label')
+
+    cm = confusion_matrix(y_test, np.argmax(yhat, axis=1))
+    # Normalize the confusion matrix by row (i.e by the number of samples
+    # in each class)
+    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
+    _plot_confusion_matrix(cm_normalized, title='Normalized confusion matrix')
+    plt.savefig('confusion.pdf')
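
Usage sketch (not part of the diff above; the exact wiring inside pipeline.py is an assumption): once the `data` dictionary has been built in the format documented in `functional_nn.test`, the functional net and the new confusion-matrix plot could be chained roughly as follows, reusing only functions defined in this diff.

    # sketch only -- assumes `data` already holds the X_jet_*, X_photon_*, X_event_*, y_* and w_* arrays
    from functional_nn import train as NN_train, test as NN_test
    from plotting import plot_confusion

    net = NN_train(data)        # multi-stream RNN; best weights reloaded from weights/functional_combined_rnn-progress.h5
    yhat = NN_test(net, data)   # [n_ev, n_classes] softmax scores, also saved to output/yhat.h5
    plot_confusion(yhat, data)  # row-normalized confusion matrix saved to confusion.pdf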