Skip to content

Commit

Permalink
add regression capability
Browse files Browse the repository at this point in the history
  • Loading branch information
Ubuntu committed Jul 19, 2016
1 parent 29170ae commit a1515a8
Show file tree
Hide file tree
Showing 5 changed files with 226 additions and 19 deletions.
25 changes: 18 additions & 7 deletions data_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def _pairwise(iterable):
return izip(a, a)


def read_in(class_files_dict, tree_name, particles):
def read_in(class_files_dict, tree_name, particles, mode):
'''
takes in dict mapping class names to list of root files, loads them and slices them into ML format
Args:
Expand Down Expand Up @@ -74,22 +74,33 @@ def read_in(class_files_dict, tree_name, particles):
le: LabelEncoder to transform numerical y back to its string values
'''

#convert files to pd data frames, assign key to y, concat all files
#convert files to pd data frames, assign key or mass to y, concat all files

def _make_df(val, key):
df = pup.root2panda(val, tree_name)
df['y'] = key
if mode == 'classification':
df['y'] = key
elif mode == 'regression':
try:
df['y'] = int(key[1:])
except ValueError:
df['y'] = 0
return df

all_events = pd.concat([_make_df(val, key) for key, val in class_files_dict.iteritems()], ignore_index=True)

X = OrderedDict()
for particle_name, particle_info in particles.iteritems():
logger.info('Building X_{}'.format(particle_name))
X[particle_name] = all_events[particle_info["branches"]].values

#transform string labels to integer classes
le = LabelEncoder()
y = le.fit_transform(all_events['y'].values)
#transform string labels to integer classes for classification or set y for regression
if mode == 'classification':
le = LabelEncoder()
y = le.fit_transform(all_events['y'].values)
elif mode == 'regression':
le = None
y = all_events['y'].values

w = all_events['yybb_weight'].values

Expand Down
126 changes: 126 additions & 0 deletions nn_with_modes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
from keras.models import Sequential
from keras.layers.core import Activation, Dense, Dropout
from keras.layers import Masking, GRU, Merge, Input, merge
from keras.callbacks import EarlyStopping, ModelCheckpoint
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

def train(data, mode):
    '''
    Build, compile and fit the combined jet + photon recurrent network.

    Args:
        data: an OrderedDict containing all X, y, w ndarrays for all particles (both train and test), e.g.:
            data = {
                "X_jet_train" : X_jet_train,
                "X_jet_test" : X_jet_test,
                "X_photon_train" : X_photon_train,
                "X_photon_test" : X_photon_test,
                "y_train" : y_train,
                "y_test" : y_test,
                "w_train" : w_train,
                "w_test" : w_test
            }
            Only "X_jet_train", "X_photon_train" and "y_train" are read here.
        mode: a string specifying the type of task, either 'regression' or 'classification'
    Returns:
        combined_rnn: a Sequential trained on data
    Raises:
        ValueError: if mode is neither 'classification' nor 'regression'
    '''
    # local import: only needed to prepare the checkpoint directory
    import os

    # -- fail early: an unknown mode would otherwise leave the model
    #    without an output layer and uncompiled, and only blow up inside fit()
    if mode not in ('classification', 'regression'):
        raise ValueError(
            "mode must be 'classification' or 'regression', got {!r}".format(mode))

    X_jet_train = data['X_jet_train']
    X_photon_train = data['X_photon_train']
    y_train = data['y_train']

    # -- per-event input shapes (everything but the leading event axis)
    JET_SHAPE = X_jet_train.shape[1:]
    PHOTON_SHAPE = X_photon_train.shape[1:]

    # -- one recurrent channel per particle type; the Masking layers are
    #    configured with mask_value=-999
    jet_channel = Sequential()
    jet_channel.add(Masking(mask_value=-999, input_shape=JET_SHAPE, name='jet_masking'))
    jet_channel.add(GRU(25, name='jet_gru'))
    jet_channel.add(Dropout(0.3, name='jet_dropout'))

    photon_channel = Sequential()
    photon_channel.add(Masking(mask_value=-999, input_shape=PHOTON_SHAPE, name='photon_masking'))
    photon_channel.add(GRU(10, name='photon_gru'))
    photon_channel.add(Dropout(0.3, name='photon_dropout'))

    # -- concatenate both channels and process them through dense layers
    combined_rnn = Sequential()
    combined_rnn.add(Merge([jet_channel, photon_channel], mode='concat'))
    combined_rnn.add(Dense(24, activation='relu'))
    combined_rnn.add(Dropout(0.3))
    combined_rnn.add(Dense(12, activation='relu'))
    combined_rnn.add(Dropout(0.3))

    # -- the output layer and loss depend on the task
    if mode == 'classification':
        combined_rnn.add(Dense(6, activation='softmax'))
        combined_rnn.compile('adam', 'sparse_categorical_crossentropy')
    else:  # mode == 'regression'
        combined_rnn.add(Dense(1))
        combined_rnn.compile('adam', 'mae')

    # -- balance the targets: weight_k = n_events / (n_distinct_targets * n_events_k)
    #    computed once instead of re-running np.unique for every target value
    targets, counts = np.unique(y_train, return_counts=True)
    n_events = float(len(y_train))
    class_weight = {
        k: n_events / float(len(targets) * n_k) for k, n_k in zip(targets, counts)
    }

    # -- ModelCheckpoint writes into ./models; create it so that saving the
    #    first checkpoint does not fail on a fresh checkout
    checkpoint_path = './models/combinedrnn-progress'
    checkpoint_dir = os.path.dirname(checkpoint_path)
    if not os.path.isdir(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    print('Training:')
    try:
        combined_rnn.fit(
            [X_jet_train, X_photon_train],
            y_train,
            batch_size=16,
            class_weight=class_weight,
            callbacks=[
                EarlyStopping(verbose=True, patience=10, monitor='val_loss'),
                ModelCheckpoint(checkpoint_path,
                                monitor='val_loss', verbose=True, save_best_only=True)
            ],
            nb_epoch=30,
            validation_split=0.2)

    except KeyboardInterrupt:
        # -- allow the user to stop training by hand and still get the net back
        print('Training ended early.')

    return combined_rnn

def test(net, data):
    '''
    Run the trained net on the test inputs and return its raw predictions.

    Args:
        net: a Sequential instance trained on data
        data: an OrderedDict containing all X, y, w ndarrays for all particles (both train and test), e.g.:
            data = {
                "X_jet_train" : X_jet_train,
                "X_jet_test" : X_jet_test,
                "X_photon_train" : X_photon_train,
                "X_photon_test" : X_photon_test,
                "y_train" : y_train,
                "y_test" : y_test,
                "w_train" : w_train,
                "w_test" : w_test
            }
            Only "X_jet_test" and "X_photon_test" are read here, so the
            function also works on data without truth labels.
    Returns:
        yhat_rnn: a numpy array containing the predicted values for each event
            In the case of regression, one value per event:
                [[  28.82653809]
                 [ 332.62536621]
                 [ 343.72662354]
                 ...,
                 [ 325.11975098]]
            In the case of classification, one row of class scores per event:
                [[ 2.98070186e-03 1.02684367e-03 6.20509265e-04 5.31344442e-04
                   4.20760407e-05 9.94798541e-01]
                 [ 1.43380761e-01 2.02934369e-01 2.18192190e-01 2.09208429e-01
                   1.84640139e-01 4.16441038e-02]
                 ...,
                 [ 3.07332397e-01 2.48623013e-01 1.71252742e-01 1.26610160e-01
                   6.08449057e-02 8.53367895e-02]]
    '''
    X_jet_test = data['X_jet_test']
    X_photon_test = data['X_photon_test']

    # -- inputs are passed in the same order used for fitting: [jet, photon]
    yhat_rnn = net.predict([X_jet_test, X_photon_test], verbose=True, batch_size=512)

    return yhat_rnn
27 changes: 19 additions & 8 deletions pipeline.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
import json
from data_processing import read_in, shuffle_split_scale, padding
import numpy as np
import pandautils as pup
import cPickle
from plotting import plot_inputs
import utils
import logging
#from plotting import plot_inputs, plot_performance
from functional_nn import train, test
from plotting import plot_inputs, plot_confusion, plot_regression#, plot_performance
from nn_with_modes import train, test

def main(json_config, tree_name):
def main(json_config, mode, tree_name):
'''
Args:
-----
Expand Down Expand Up @@ -51,7 +51,7 @@ def sha(s):
return m.hexdigest()[:5]

#-- if the pickle exists, use it
pickle_name = 'processed_data_' + sha(config) + '.pkl'
pickle_name = 'processed_data_' + sha(config) + '_' + sha(mode) + '.pkl'
try:
logger.info('Attempting to read from {}'.format(pickle_name))
data = cPickle.load(open(pickle_name, 'rb'))
Expand All @@ -61,7 +61,7 @@ def sha(s):
logger.info('Pre-processed data not found in {}'.format(pickle_name))
logger.info('Processing data')
# -- transform ROOT files into standard ML format (ndarrays)
X, y, w, le = read_in(class_files_dict, tree_name, particles_dict)
X, y, w, le = read_in(class_files_dict, tree_name, particles_dict, mode)

# -- shuffle, split samples into train and test set, scale features
data = shuffle_split_scale(X, y, w)
Expand Down Expand Up @@ -101,11 +101,18 @@ def sha(s):
# # combine the outputs and process them through a bunch of FF layers
# # use a validation split of 20%
# # save out the weights to hdf5 and the model to yaml
net = train(data)
net = train(data, mode)

# # -- test
# # evaluate performance on the test set
yhat = test(net, data)

print yhat
# # -- plot performance by mode
if mode == 'regression':
plot_regression(yhat, data)
if mode == 'classification':
plot_confusion(yhat, data)

# # -- plot performance
# # produce ROC curves to evaluate performance
Expand All @@ -122,8 +129,12 @@ def sha(s):
# -- read in arguments
parser = argparse.ArgumentParser()
parser.add_argument('config', help="path to JSON file that specifies classes and corresponding ROOT files' paths")
parser.add_argument('mode', help="classification or regression")
parser.add_argument('--tree', help="name of the tree to open in the ntuples", default='mini')
args = parser.parse_args()

if args.mode != 'classification' and args.mode != 'regression':
raise ValueError('Mode must be classification or regression')

# -- pass arguments to main
sys.exit(main(args.config, args.tree))
sys.exit(main(args.config, args.mode, args.tree))
62 changes: 58 additions & 4 deletions plotting.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,21 +50,28 @@ def _plot_X(train, test, y_train, y_test, w_train, w_test, le, particle, particl
color = iter(cm.rainbow(np.linspace(0, 1, len(np.unique(y_train)))))

# -- loop through the classes
for k in range(len(np.unique(y_train))):
for k in np.unique(y_train):
c = next(color)

# -- in regression, le is None and we want to keep the original key
try:
transformed_k=le.inverse_transform(k)
except AttributeError:
transformed_k=k

_ = plt.hist(flat_train[y_train == k],
bins=bins,
histtype='step',
normed=True,
label='Train - ' + le.inverse_transform(k),
label='Train - ' + str(transformed_k),
weights=w_train[y_train == k],
color=c,
linewidth=1)
_ = plt.hist(flat_test[y_test == k],
bins=bins,
histtype='step',
normed=True,
label='Test - ' + le.inverse_transform(k),
label='Test - ' + str(transformed_k),
weights=w_test[y_test == k],
color=c,
linewidth=2,
Expand Down Expand Up @@ -151,4 +158,51 @@ def _plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):
# in each class)
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
_plot_confusion_matrix(cm_normalized, title='Normalized confusion matrix')
plt.savefig('confusion.pdf')
plt.savefig('confusion2.pdf')


def plot_regression(yhat, data):
    '''
    Histogram the regression output separately for each true target value.

    Args:
        yhat: numpy array with the net predictions on the test data, one
            value per event (e.g. of dim [n_ev, 1] as returned by the net);
            it is flattened to [n_ev] before plotting
        data: an OrderedDict containing all X, y, w ndarrays for all particles (both train and test), e.g.:
            data = {
                "X_jet_train" : X_jet_train,
                "X_jet_test" : X_jet_test,
                "X_photon_train" : X_photon_train,
                "X_photon_test" : X_photon_test,
                "y_train" : y_train,
                "y_test" : y_test,
                "w_train" : w_train,
                "w_test" : w_test
            }
            Only "y_test" and "w_test" are read here.
    Saves:
        'regression_test.pdf': a histogram plotting yhat containing the predicted masses
    '''

    # -- flatten the predictions so that the boolean masks built from y_test
    #    select one prediction per event
    yhat = np.ravel(yhat)
    # -- np.asarray accepts both a plain ndarray and a pandas Series,
    #    whereas `.values` fails on the ndarray described above
    y_test = np.asarray(data['y_test'])
    w_test = data['w_test']

    # -- one color per distinct true target value
    targets = np.unique(y_test)
    color = iter(cm.rainbow(np.linspace(0, 1, len(targets))))
    matplotlib.rcParams.update({'font.size': 16})
    plt.figure(figsize=(11.69, 8.27), dpi=100)

    # -- common binning spanning both predicted and true values;
    #    array min/max give scalar edges and avoid a Python-level loop
    bins = np.linspace(
        min(yhat.min(), y_test.min()),
        max(yhat.max(), y_test.max()),
        30)

    for k in targets:
        c = next(color)
        selected = (y_test == k)
        _ = plt.hist(yhat[selected],
            bins=bins,
            histtype='step',
            normed=True,
            label=str(k),
            weights=w_test[selected],
            color=c,
            linewidth=1)

    plt.ylabel('Weighted Events')
    plt.legend(prop={'size': 10}, fancybox=True, framealpha=0.5)
    plt.savefig('regression_test.pdf')
5 changes: 5 additions & 0 deletions utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,9 @@ def load_config(config_file):
if k not in particle_info.keys():
raise KeyError('Particle configuration requires key: {}'.format(k))

for class_name in config['classes'].keys():
if not class_name.startswith('X'):
if class_name != 'bkg':
raise KeyError('Class name must start with X if it is not bkg')

return config

0 comments on commit a1515a8

Please sign in to comment.