From 2d599128fa55c9133bfdd031ca34e50e544b4952 Mon Sep 17 00:00:00 2001
From: Michela Paganini
Date: Wed, 6 Jul 2016 02:06:29 +0200
Subject: [PATCH] Refactoring to generalize to N streams

---
 data_processing.py | 72 ++++++++++++++++++++-------------------
 pipeline.py        | 85 +++++++++++++++-------------------------------
 plotting.py        | 26 ++++++++++----
 utils.py           | 14 +++++++-
 4 files changed, 98 insertions(+), 99 deletions(-)

diff --git a/data_processing.py b/data_processing.py
index 378cd4c..4dd1256 100644
--- a/data_processing.py
+++ b/data_processing.py
@@ -5,8 +5,18 @@
 from sklearn.cross_validation import train_test_split
 import pandautils as pup
 import warnings
+import logging
+from collections import OrderedDict
+from itertools import izip
 
-def _build_X(events, phrase, exclude_vars):
+logger = logging.getLogger('data_processing')
+
+def _pairwise(iterable):
+    "s -> (s0, s1), (s2, s3), (s4, s5), ..."
+    a = iter(iterable)
+    return izip(a, a)
+
+def _build_X(events, particle_branches):
     '''slices related branches into a numpy array
     Args:
         events: a pandas DataFrame containing the complete data by event
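For reference, the new _pairwise helper chops a flat iterable into consecutive pairs by pulling twice from a single iterator; shuffle_split_scale (further down in this file) uses it to regroup the interleaved (train, test) outputs of train_test_split stream by stream. A minimal illustration of the behavior, with invented sample values:

    from itertools import izip

    def _pairwise(iterable):
        "s -> (s0, s1), (s2, s3), (s4, s5), ..."
        a = iter(iterable)
        return izip(a, a)

    pairs = _pairwise(['jet_train', 'jet_test', 'photon_train', 'photon_test'])
    print list(pairs)  # [('jet_train', 'jet_test'), ('photon_train', 'photon_test')]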
@@ -14,12 +24,11 @@ def _build_X(events, phrase, exclude_vars):
     Returns:
         output_array: a numpy array containing data only pertaining to the related branches
     '''
-    branch_names = [key for key in events.keys() if (key.startswith(phrase) and (key not in exclude_vars))]
-    sliced_events = events[branch_names].as_matrix()
-    return sliced_events, branch_names
+    sliced_events = events[particle_branches].values
+    return sliced_events
 
 
-def read_in(class_files_dict, exclude_vars):
+def read_in(class_files_dict, tree_name, streams):
     '''
     takes in dict mapping class names to list of root files, loads them and slices them into ML format
     Args:
         class_files_dict: dictionary that links the names of the different classes
             in the classification problem to the paths of the ROOT files
             associated with each class; for example:
             {
                 "ttbar" :
                 [
                     "/path/to/file1.root",
                     "/path/to/file2.root",
                 ],
                 "qcd" :
                 [
                     "/path/to/file3.root",
                     "/path/to/file4*",
                 ],
                 ...
             }
-        exclude_vars: list of strings of names of branches not to be used for training
+        tree_name: string, name of the tree to open in the ntuples
+        streams: dictionary that maps each stream name (e.g. "jet") to the list
+            of branches to slice out for that stream
+
     Returns:
-        X_jets: ndarray [n_ev, n_jet_feat] containing jet related branches
-        X_photons: ndarray [n_ev, n_photon_feat] containing photon related branches
+        X: OrderedDict mapping each stream name to an ndarray [n_ev, n_stream_feat]
+           containing only the branches assigned to that stream
@@ -54,24 +64,26 @@ def read_in(class_files_dict, exclude_vars):
     #convert files to pd data frames, assign key to y, concat all files
     def _make_df(val, key):
-        df = pup.root2panda(val, 'events')
+        df = pup.root2panda(val, tree_name)
         df['y'] = key
         return df
 
     all_events = pd.concat([_make_df(val, key) for key, val in class_files_dict.iteritems()], ignore_index=True)
-
-    #slice related branches
-    X_jets, jet_branches = _build_X(all_events, 'Jet', exclude_vars)
-    X_photons, photon_branches = _build_X(all_events, 'Photon', exclude_vars)
-    X_muons, muon_branches = _build_X(all_events, 'Muon', exclude_vars)
+    X = OrderedDict()
+    for stream_name, stream_info in streams.iteritems():
+        logger.info('building X_{}'.format(stream_name))
+        X[stream_name] = _build_X(all_events, stream_info["branches"])
+
     #transform string labels to integer classes
     le = LabelEncoder()
     y = le.fit_transform(all_events['y'].values)
-    w = all_events['EventWeight'].values
+    w = all_events['yybb_weight'].values
 
-    return X_jets, X_photons, X_muons, y, w, jet_branches + photon_branches + muon_branches
+    return X, y, w
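Note that read_in now receives its per-stream branch lists through the streams argument (populated from the 'particles' section of the JSON config in pipeline.py) instead of deriving them from a branch-name prefix plus an exclusion list. A hypothetical config illustrating the expected shape: the branch names are invented, but the 'classes' and 'particles' keys, the per-particle 'branches' lists, and the 'max_length' field are the ones the pipeline reads, and the max_length values mirror the constants (5, 3, 2) previously hard-coded in pipeline.py:

    {
        "classes": {
            "ttbar": ["/path/to/file1.root", "/path/to/file2.root"],
            "qcd":   ["/path/to/file3.root", "/path/to/file4*"]
        },
        "particles": {
            "jet":    {"branches": ["JetPt", "JetEta"], "max_length": 5},
            "photon": {"branches": ["PhotonPt", "PhotonEta"], "max_length": 3},
            "muon":   {"branches": ["MuonPt", "MuonEta"], "max_length": 2}
        }
    }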
@@ -99,7 +111,8 @@ def _scale(matrix_train, matrix_test):
     return matrix_train, matrix_test
 
 
-def shuffle_split_scale(X_jets, X_photons, X_muons, y, w):
+def shuffle_split_scale(X, y, w):
     '''
     takes in X_jets, X_photons, X_Muons, y and w nd arrays, shuffles them, splits them into test (40%) and training (60%) sets
     Args:
@@ -109,29 +122,20 @@ def shuffle_split_scale(X_jets, X_photons, X_muons, y, w):
         y: ndarray [n_ev, 1] containing the truth labels
         w: ndarray [n_ev, 1] containing EventWeights
     Returns:
-        X_jets_train: ndarray [n_ev_train, n_jet_feat] containing the events of jet related branches allocated for training
-        X_jets_test: ndarray [n_ev_test, n_jet_feat] containing the events of jet related branches allocated for testing
-        X_photons_train: ndarray [n_ev_train, n_photon_feat] containing the events of photon related branches allocated for training
-        X_photons_test: ndarray [n_ev_test, n_photon_feat] containing the events of photon related branches allocated for testing
-        X_muons_train: ndarray [n_ev_train, n_muon_feat] containing the events of muon related branches allocated for training
-        X_muons_test: ndarray [n_ev_test, n_muon_feat] containing the events of muon related branches allocated for testing
-        Y_train: ndarray [n_ev_train, 1] containing the shuffled truth labels for training
-        Y_test: ndarray [n_ev_test, 1] containing the shuffled truth labels allocated for testing
-        W_train: ndarray [n_ev_train, 1] containing the shuffled EventWeights allocated for training
-        W_test: ndarray [n_ev_test, 1] containing the shuffled EventWeights allocated for testing
+        data: OrderedDict containing the shuffled, split and scaled arrays, keyed by
+              'X_<stream>_train' and 'X_<stream>_test' for each input stream,
+              plus 'y_train', 'y_test', 'w_train' and 'w_test'
     '''
     #shuffle events & split into testing and training sets
-    X_jets_train, X_jets_test, \
-    X_photons_train, X_photons_test, \
-    X_muons_train, X_muons_test, \
-    Y_train, Y_test, \
-    W_train, W_test = train_test_split(X_jets, X_photons, X_muons, y, w, test_size=0.4)
+    logger.info('shuffling, splitting and scaling X')
+
+    data_tuple = train_test_split(*(X.values() + [y, w]), test_size=0.4)
 
-    X_jets_train, X_jets_test = _scale(X_jets_train, X_jets_test)
-    X_photons_train, X_photons_test = _scale(X_photons_train, X_photons_test)
-    X_muons_train, X_muons_test = _scale(X_muons_train, X_muons_test)
+    data = OrderedDict()
+    for particle, (train, test) in zip(X.keys(), _pairwise(data_tuple[:(2 * len(X))])):
+        data['X_' + particle + '_train'], data['X_' + particle + '_test'] = _scale(train, test)
 
-    return X_jets_train, X_jets_test, X_photons_train, X_photons_test, X_muons_train, X_muons_test, Y_train, Y_test, W_train, W_test
+    data['y_train'], data['y_test'], data['w_train'], data['w_test'] = data_tuple[-4:]
+
+    return data
 
 
 def padding(X, max_length, value=-999):
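The rewrite above relies on train_test_split applying one common shuffle to every positional array and returning the splits interleaved as (a_train, a_test, b_train, b_test, ...), so the first 2 * len(X) entries are the stream arrays in X's insertion order and the last four are the y and w splits. A toy check of that ordering (shapes and values are arbitrary; sklearn.cross_validation is the pre-0.18 module path this code already uses):

    import numpy as np
    from collections import OrderedDict
    from sklearn.cross_validation import train_test_split

    X = OrderedDict([('jet', np.arange(10).reshape(5, 2)),
                     ('photon', np.arange(15).reshape(5, 3))])
    y = np.array([0, 0, 1, 1, 1])
    w = np.ones(5)

    out = train_test_split(*(X.values() + [y, w]), test_size=0.4)
    # out == [jet_train, jet_test, photon_train, photon_test,
    #         y_train, y_test, w_train, w_test]
    assert len(out) == 2 * len(X) + 4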
diff --git a/pipeline.py b/pipeline.py
index 4eb4e2c..2186752 100644
--- a/pipeline.py
+++ b/pipeline.py
@@ -8,7 +8,7 @@
 #from plotting import plot_inputs, plot_performance
 #from nn_model import train, test
 
-def main(json_config, exclude_vars):
+def main(json_config, tree_name):
     '''
     Args:
     -----
@@ -29,7 +29,7 @@ def main(json_config, exclude_vars):
             ],
             ...
         }
-        exclude_vars: list of strings of names of branches not to be used for training
+        tree_name: string, name of the tree to open in the ntuples
     Saves:
     ------
-        'processed_data.h5': dictionary with processed ndarrays (X, y, w) for all particles for training and testing
+        'processed_data_<hash>.pkl': dictionary with processed ndarrays (X, y, w) for all particles for training and testing
     '''
@@ -38,7 +38,9 @@ def main(json_config, exclude_vars):
 
     # -- load in the JSON file
     logger.info('Loading JSON config')
-    class_files_dict = json.load(open(json_config))
+    config = utils.load_config(json_config)
+    class_files_dict = config['classes']
+    particles = config['particles']
 
     # -- hash the config dictionary to check if the pickled data exists
     from hashlib import md5
@@ -48,49 +50,31 @@ def sha(s):
         m.update(s.__repr__())
         return m.hexdigest()[:5]
 
     # -- if the pickle exists, use it
     try:
         data = cPickle.load(open('processed_data_' + sha(class_files_dict) + '.pkl', 'rb'))
         logger.info('Preprocessed data found in pickle')
-        X_jets_train = data['X_jets_train']
-        X_jets_test = data['X_jets_test']
-        X_photons_train = data['X_photons_train']
-        X_photons_test = data['X_photons_test']
-        X_muons_train = data['X_muons_train']
-        X_muons_test = data['X_muons_test']
-        y_train = data['y_train']
-        y_test = data['y_test']
-        w_train = data['w_train']
-        w_test = data['w_test']
-        varlist = data['varlist']
-
-    # -- otherwise, process the new data
+
+    # -- otherwise, process the new data
     except IOError:
         logger.info('Preprocessed data not found')
         logger.info('Processing data')
         # -- transform ROOT files into standard ML format (ndarrays)
-        X_jets, X_photons, X_muons, y, w, varlist = read_in(class_files_dict, exclude_vars)
+        X, y, w = read_in(class_files_dict, tree_name, particles)
+
         # -- shuffle, split samples into train and test set, scale features
-        X_jets_train, X_jets_test, \
-        X_photons_train, X_photons_test, \
-        X_muons_train, X_muons_test, \
-        y_train, y_test, \
-        w_train, w_test = shuffle_split_scale(X_jets, X_photons, X_muons, y, w)
+        data = shuffle_split_scale(X, y, w)
+
+        data.update({
+            'varlist' : [
+                branch
+                for particle_info in particles.values()
+                for branch in particle_info['branches']
+            ]
+        })
 
         # -- save out to pickle
         logger.info('Saving processed data to pickle')
-        cPickle.dump({
-            'X_jets_train' : X_jets_train,
-            'X_jets_test' : X_jets_test,
-            'X_photons_train' : X_photons_train,
-            'X_photons_test' : X_photons_test,
-            'X_muons_train' : X_muons_train,
-            'X_muons_test' : X_muons_test,
-            'y_train' : y_train,
-            'y_test' : y_test,
-            'w_train' : w_train,
-            'w_test' : w_test,
-            'varlist' : varlist
-            },
+        cPickle.dump(data,
             open('processed_data_' + sha(class_files_dict) + '.pkl', 'wb'),
             protocol=cPickle.HIGHEST_PROTOCOL)
 
@@ -101,25 +85,12 @@ def sha(s):
     Plots should be saved out as a pdf with informative names
     '''
     logger.info('Plotting input distributions')
-    plot_inputs(
-        X_jets_train, X_jets_test,
-        X_photons_train, X_photons_test,
-        X_muons_train, X_muons_test,
-        y_train, y_test,
-        w_train, w_test,
-        varlist
-    )
-
-    X_jets_train, X_jets_test, \
-    X_photons_train, X_photons_test, \
-    X_muons_train, X_muons_test = map(padding,
-        [
-            X_jets_train, X_jets_test,
-            X_photons_train, X_photons_test,
-            X_muons_train, X_muons_test
-        ],
-        [5, 5, 3, 3, 2, 2]
-    )
+    plot_inputs(data, particles.keys())
+
+    logger.info('Padding')
+    for key in data:
+        if key.startswith('X_'):
+            data[key] = padding(data[key], particles[key.split('_')[1]]['max_length'])
 
 # # -- train
 # # design a Keras NN with three RNN streams (jets, photons, muons)
@@ -147,8 +118,8 @@
     # -- read in arguments
     parser = argparse.ArgumentParser()
    parser.add_argument('config', help="JSON file that specifies classes and corresponding ROOT files' paths")
-    parser.add_argument('--exclude', help="names of branches to exclude from training", nargs="*", default=[])
+    parser.add_argument('--tree', help="name of the tree to open in the ntuples", default='mini')
     args = parser.parse_args()
 
     # -- pass arguments to main
-    sys.exit(main(args.config, args.exclude))
+    sys.exit(main(args.config, args.tree))
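The padding function called in the new loop is not modified by this patch and its body is not shown here. As a rough sketch of the contract the loop assumes, each 'X_...' entry holds per-event, variable-length object lists that get truncated or padded with a sentinel value up to the stream's max_length. The following is an illustration under those assumptions, not the repository's implementation:

    import numpy as np

    def padding_sketch(X, max_length, value=-999):
        # X: [n_ev, n_feat] object array; each cell holds a variable-length
        # sequence of per-object values (e.g. the pT of every jet in an event)
        n_ev, n_feat = X.shape
        out = np.full((n_ev, n_feat, max_length), value, dtype=float)
        for i in range(n_ev):
            for j in range(n_feat):
                seq = np.asarray(X[i, j], dtype=float)[:max_length]  # truncate long events
                out[i, j, :len(seq)] = seq                           # pad short ones
        return out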
diff --git a/plotting.py b/plotting.py
index 92082a3..c177253 100644
--- a/plotting.py
+++ b/plotting.py
@@ -40,7 +40,7 @@ def _plot_X(train, test, y_train, y_test, w_train, w_test, varlist, feature):
         min(min(flat_train), min(flat_test)),
         max(max(flat_train), max(flat_test)),
         30)
-    color = iter(cm.rainbow(np.linspace(0, 1, 2)))
+    color = iter(cm.rainbow(np.linspace(0, 1, len(np.unique(y_train)))))
     # -- loop through the classes
     for k in range(len(np.unique(y_train))):
         c = next(color)
@@ -48,7 +48,7 @@ def _plot_X(train, test, y_train, y_test, w_train, w_test, varlist, feature):
             bins=bins,
             histtype='step',
             normed=True,
-            label='Train - class: '+str(k),
+            label='Train - class: ' + str(k),
             weights=w_train_ext[y_train_ext == k],
             color=c,
             linewidth=1)
@@ -73,8 +73,10 @@ def _plot_X(train, test, y_train, y_test, w_train, w_test, varlist, feature):
         #plt.show()
         column_counter += 1
 
-def plot_inputs(X_jets_train, X_jets_test, X_photons_train, X_photons_test,
-                X_muons_train, X_muons_test, y_train, y_test, w_train, w_test, varlist):
+def plot_inputs(data, particle_names):
     '''
     Args:
         X_jets_train: ndarray [n_ev_train, n_jet_feat] containing the
@@ -105,6 +107,16 @@ def plot_inputs(X_jets_train, X_jets_test, X_photons_train, X_photons_test,
         sets of each class for each feature
     '''
-    _plot_X(X_jets_train, X_jets_test, y_train, y_test, w_train, w_test, varlist, 'Jet')
-    _plot_X(X_photons_train, X_photons_test, y_train, y_test, w_train, w_test, varlist, 'Photon')
-    _plot_X(X_muons_train, X_muons_test, y_train, y_test, w_train, w_test, varlist, 'Muon')
+    for particle in particle_names:
+        _plot_X(
+            data['X_' + particle + '_train'],
+            data['X_' + particle + '_test'],
+            data['y_train'],
+            data['y_test'],
+            data['w_train'],
+            data['w_test'],
+            data['varlist'],
+            particle)
diff --git a/utils.py b/utils.py
index a61fc72..ca27c59 100644
--- a/utils.py
+++ b/utils.py
@@ -1,4 +1,5 @@
 import logging
+import json
 
 def configure_logging():
     rlogger = logging.getLogger()
@@ -7,4 +8,15 @@ def configure_logging():
     logging.addLevelName(logging.WARNING, "\033[1;31m{:8}\033[1;0m".format(logging.getLevelName(logging.WARNING)))
     logging.addLevelName(logging.ERROR, "\033[1;35m{:8}\033[1;0m".format(logging.getLevelName(logging.ERROR)))
     logging.addLevelName(logging.INFO, "\033[1;32m{:8}\033[1;0m".format(logging.getLevelName(logging.INFO)))
-    logging.addLevelName(logging.DEBUG, "\033[1;34m{:8}\033[1;0m".format(logging.getLevelName(logging.DEBUG)))
\ No newline at end of file
+    logging.addLevelName(logging.DEBUG, "\033[1;34m{:8}\033[1;0m".format(logging.getLevelName(logging.DEBUG)))
+
+def load_config(config_file):
+    # TO DO: make sure that particle names don't have underscores in them
+    with open(config_file, 'r') as f:
+        config = json.load(f)
+    required_keys = ['classes', 'particles']
+
+    for k in required_keys:
+        if k not in config:
+            raise KeyError('pipeline configuration requires key: {}'.format(k))
+
+    return config
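A possible end-to-end usage of load_config, with a made-up file name:

    import utils

    config = utils.load_config('config.json')   # hypothetical path
    print sorted(config['particles'].keys())    # e.g. ['jet', 'muon', 'photon']

The underscore caveat in the TO DO is load-bearing: pipeline.py recovers each particle name from data keys such as 'X_jet_train' via key.split('_')[1], so a particle named, say, 'fat_jet' would resolve to 'fat' and the max_length lookup would fail.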