From a1515a84e0200db4362b0fd5cbd729bbf6adc51f Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 19 Jul 2016 16:04:11 +0000 Subject: [PATCH] add regression capability --- data_processing.py | 25 ++++++--- nn_with_modes.py | 126 +++++++++++++++++++++++++++++++++++++++++++++ pipeline.py | 27 +++++++--- plotting.py | 62 ++++++++++++++++++++-- utils.py | 5 ++ 5 files changed, 226 insertions(+), 19 deletions(-) create mode 100644 nn_with_modes.py diff --git a/data_processing.py b/data_processing.py index 8275b03..f5a348f 100644 --- a/data_processing.py +++ b/data_processing.py @@ -17,7 +17,7 @@ def _pairwise(iterable): return izip(a, a) -def read_in(class_files_dict, tree_name, particles): +def read_in(class_files_dict, tree_name, particles, mode): ''' takes in dict mapping class names to list of root files, loads them and slices them into ML format Args: @@ -74,22 +74,33 @@ def read_in(class_files_dict, tree_name, particles): le: LabelEncoder to transform numerical y back to its string values ''' - #convert files to pd data frames, assign key to y, concat all files + #convert files to pd data frames, assign key or mass to y, concat all files + def _make_df(val, key): df = pup.root2panda(val, tree_name) - df['y'] = key + if mode == 'classification': + df['y'] = key + elif mode == 'regression': + try: + df['y'] = int(key[1:]) + except ValueError: + df['y'] = 0 return df all_events = pd.concat([_make_df(val, key) for key, val in class_files_dict.iteritems()], ignore_index=True) - + X = OrderedDict() for particle_name, particle_info in particles.iteritems(): logger.info('Building X_{}'.format(particle_name)) X[particle_name] = all_events[particle_info["branches"]].values - #transform string labels to integer classes - le = LabelEncoder() - y = le.fit_transform(all_events['y'].values) + #transform string labels to integer classes for classification or set y for regression + if mode == 'classification': + le = LabelEncoder() + y = le.fit_transform(all_events['y'].values) + elif mode == 'regression': + le = None + y = all_events['y'].values w = all_events['yybb_weight'].values diff --git a/nn_with_modes.py b/nn_with_modes.py new file mode 100644 index 0000000..667098b --- /dev/null +++ b/nn_with_modes.py @@ -0,0 +1,126 @@ +from keras.models import Sequential +from keras.layers.core import Activation, Dense, Dropout +from keras.layers import Masking, GRU, Merge, Input, merge +from keras.callbacks import EarlyStopping, ModelCheckpoint +import numpy as np +import matplotlib +import matplotlib.pyplot as plt + +def train(data, mode): + ''' + Args: + data: an OrderedDict containing all X, y, w ndarrays for all particles (both train and test), e.g.: + data = { + "X_jet_train" : X_jet_train, + "X_jet_test" : X_jet_test, + "X_photon_train" : X_photon_train, + "X_photon_test" : X_photon_test, + "y_train" : y_train, + "y_test" : y_test, + "w_train" : w_train, + "w_test" : w_test + } + mode: a string specifying the type of task, either 'regression' or 'classification' + Returns: + combine_rnn: a Sequential trained on data + ''' + + X_jet_train = data['X_jet_train'] + X_photon_train = data['X_photon_train'] + y_train = data['y_train'] + + jet_channel = Sequential() + photon_channel = Sequential() + + JET_SHAPE = X_jet_train.shape[1:] + PHOTON_SHAPE = X_photon_train.shape[1:] + + jet_channel.add(Masking(mask_value=-999, input_shape=JET_SHAPE, name='jet_masking')) + jet_channel.add(GRU(25, name='jet_gru')) + jet_channel.add(Dropout(0.3, name='jet_dropout')) + + photon_channel.add(Masking(mask_value=-999, input_shape=PHOTON_SHAPE, name='photon_masking')) + photon_channel.add(GRU(10, name='photon_gru')) + photon_channel.add(Dropout(0.3, name='photon_dropout')) + + combined_rnn = Sequential() + combined_rnn.add(Merge([jet_channel, photon_channel], mode='concat')) + combined_rnn.add(Dense(24, activation='relu')) + combined_rnn.add(Dropout(0.3)) + combined_rnn.add(Dense(12, activation='relu')) + combined_rnn.add(Dropout(0.3)) + if mode == 'classification': + combined_rnn.add(Dense(6, activation='softmax')) + combined_rnn.compile('adam', 'sparse_categorical_crossentropy') + + elif mode == 'regression': + combined_rnn.add(Dense(1)) + combined_rnn.compile('adam', 'mae') + + print 'Training:' + try: + combined_rnn.fit([X_jet_train, X_photon_train], + y_train, batch_size=16, class_weight={ + k : (float(len(y_train)) / float(len(np.unique(y_train)) * + (len(y_train[y_train == k])))) for k in np.unique(y_train) + }, + callbacks = [ + EarlyStopping(verbose=True, patience=10, monitor='val_loss'), + ModelCheckpoint('./models/combinedrnn-progress', + monitor='val_loss', verbose=True, save_best_only=True) + ], + nb_epoch=30, validation_split = 0.2) + + except KeyboardInterrupt: + print 'Training ended early.' + + return combined_rnn + +def test(net, data): + ''' + Args: + net: a Sequential instance trained on data + data: an OrderedDict containing all X, y, w ndarrays for all particles (both train and test), e.g.: + data = { + "X_jet_train" : X_jet_train, + "X_jet_test" : X_jet_test, + "X_photon_train" : X_photon_train, + "X_photon_test" : X_photon_test, + "y_train" : y_train, + "y_test" : y_test, + "w_train" : w_train, + "w_test" : w_test + } + Returns: + yhat_rnn: a numpy array containing the predicted values for each event + In the case of regression: + [[ 28.82653809] + [ 332.62536621] + [ 343.72662354] + ..., + [ 290.94213867] + [ 311.36965942] + [ 325.11975098]] + + In the case of classification: + [[ 2.98070186e-03 1.02684367e-03 6.20509265e-04 5.31344442e-04 + 4.20760407e-05 9.94798541e-01] + [ 1.43380761e-01 2.02934369e-01 2.18192190e-01 2.09208429e-01 + 1.84640139e-01 4.16441038e-02] + [ 1.91159040e-01 2.36048207e-01 2.16798335e-01 1.83185950e-01 + 1.12408176e-01 6.04002886e-02] + ..., + [ 8.16606451e-03 5.52139431e-02 1.69157043e-01 2.80651450e-01 + 3.87061536e-01 9.97499675e-02] + [ 3.25843632e-01 2.48317569e-01 1.64540142e-01 1.18563063e-01 + 5.40928766e-02 8.86427015e-02] + [ 3.07332397e-01 2.48623013e-01 1.71252742e-01 1.26610160e-01 + 6.08449057e-02 8.53367895e-02]] + ''' + X_jet_test = data['X_jet_test'] + X_photon_test = data['X_photon_test'] + y_test= data ['y_test'] + + yhat_rnn = net.predict([X_jet_test, X_photon_test], verbose = True, batch_size = 512) + + return yhat_rnn \ No newline at end of file diff --git a/pipeline.py b/pipeline.py index 58f4c54..93bad46 100644 --- a/pipeline.py +++ b/pipeline.py @@ -1,14 +1,14 @@ import json from data_processing import read_in, shuffle_split_scale, padding +import numpy as np import pandautils as pup import cPickle -from plotting import plot_inputs import utils import logging -#from plotting import plot_inputs, plot_performance -from functional_nn import train, test +from plotting import plot_inputs, plot_confusion, plot_regression#, plot_performance +from nn_with_modes import train, test -def main(json_config, tree_name): +def main(json_config, mode, tree_name): ''' Args: ----- @@ -51,7 +51,7 @@ def sha(s): return m.hexdigest()[:5] #-- if the pickle exists, use it - pickle_name = 'processed_data_' + sha(config) + '.pkl' + pickle_name = 'processed_data_' + sha(config) + '_' + sha(mode) + '.pkl' try: logger.info('Attempting to read from {}'.format(pickle_name)) data = cPickle.load(open(pickle_name, 'rb')) @@ -61,7 +61,7 @@ def sha(s): logger.info('Pre-processed data not found in {}'.format(pickle_name)) logger.info('Processing data') # -- transform ROOT files into standard ML format (ndarrays) - X, y, w, le = read_in(class_files_dict, tree_name, particles_dict) + X, y, w, le = read_in(class_files_dict, tree_name, particles_dict, mode) # -- shuffle, split samples into train and test set, scale features data = shuffle_split_scale(X, y, w) @@ -101,11 +101,18 @@ def sha(s): # # combine the outputs and process them through a bunch of FF layers # # use a validation split of 20% # # save out the weights to hdf5 and the model to yaml - net = train(data) + net = train(data, mode) # # -- test # # evaluate performance on the test set yhat = test(net, data) + + print yhat + # # -- plot performance by mode + if mode == 'regression': + plot_regression(yhat, data) + if mode == 'classification': + plot_confusion(yhat, data) # # -- plot performance # # produce ROC curves to evaluate performance @@ -122,8 +129,12 @@ def sha(s): # -- read in arguments parser = argparse.ArgumentParser() parser.add_argument('config', help="path to JSON file that specifies classes and corresponding ROOT files' paths") + parser.add_argument('mode', help="classification or regression") parser.add_argument('--tree', help="name of the tree to open in the ntuples", default='mini') args = parser.parse_args() + if args.mode != 'classification' and args.mode != 'regression': + raise ValueError('Mode must be classification or regression') + # -- pass arguments to main - sys.exit(main(args.config, args.tree)) + sys.exit(main(args.config, args.mode, args.tree)) \ No newline at end of file diff --git a/plotting.py b/plotting.py index 3362ab6..a5ce87d 100644 --- a/plotting.py +++ b/plotting.py @@ -50,13 +50,20 @@ def _plot_X(train, test, y_train, y_test, w_train, w_test, le, particle, particl color = iter(cm.rainbow(np.linspace(0, 1, len(np.unique(y_train))))) # -- loop through the classes - for k in range(len(np.unique(y_train))): + for k in np.unique(y_train): c = next(color) + + # -- in regression, le is None and we want to keep the original key + try: + transformed_k=le.inverse_transform(k) + except AttributeError: + transformed_k=k + _ = plt.hist(flat_train[y_train == k], bins=bins, histtype='step', normed=True, - label='Train - ' + le.inverse_transform(k), + label='Train - ' + str(transformed_k), weights=w_train[y_train == k], color=c, linewidth=1) @@ -64,7 +71,7 @@ def _plot_X(train, test, y_train, y_test, w_train, w_test, le, particle, particl bins=bins, histtype='step', normed=True, - label='Test - ' + le.inverse_transform(k), + label='Test - ' + str(transformed_k), weights=w_test[y_test == k], color=c, linewidth=2, @@ -151,4 +158,51 @@ def _plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues): # in each class) cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] _plot_confusion_matrix(cm_normalized, title='Normalized confusion matrix') - plt.savefig('confusion.pdf') \ No newline at end of file + plt.savefig('confusion2.pdf') + + +def plot_regression(yhat, data): + ''' + Args: + yhat: numpy array of dim [n_ev, n_classes] with the net predictions on the test data + data: an OrderedDict containing all X, y, w ndarrays for all particles (both train and test), e.g.: + data = { + "X_jet_train" : X_jet_train, + "X_jet_test" : X_jet_test, + "X_photon_train" : X_photon_train, + "X_photon_test" : X_photon_test, + "y_train" : y_train, + "y_test" : y_test, + "w_train" : w_train, + "w_test" : w_test + } + Saves: + 'regression_test.pdf': a histogram plotting yhat containing the predicted masses + ''' + + y_test = data['y_test'].values + w_test = data['w_test'] + + color = iter(cm.rainbow(np.linspace(0, 1, len(np.unique(y_test))))) + matplotlib.rcParams.update({'font.size': 16}) + fig = plt.figure(figsize=(11.69, 8.27), dpi=100) + + bins = np.linspace( + min(min(yhat), min(y_test)), + max(max(yhat), max(y_test)), + 30) + + for k in np.unique(y_test): + c = next(color) + _ = plt.hist(yhat[y_test == k], + bins=bins, + histtype='step', + normed=True, + label=str(k), + weights=w_test[y_test == k], + color=c, + linewidth=1) + + plt.ylabel('Weighted Events') + plt.legend(prop={'size': 10}, fancybox=True, framealpha=0.5) + plt.savefig('regression_test.pdf') \ No newline at end of file diff --git a/utils.py b/utils.py index e62f570..505b4d1 100644 --- a/utils.py +++ b/utils.py @@ -27,4 +27,9 @@ def load_config(config_file): if k not in particle_info.keys(): raise KeyError('Particle configuration requires key: {}'.format(k)) + for class_name in config['classes'].keys(): + if not class_name.startswith('X'): + if class_name != 'bkg': + raise KeyError('Class name must start with X if it is not bkg') + return config \ No newline at end of file