diff --git a/config_hh.json b/config_hh.json
new file mode 100644
index 0000000..74b9eb6
--- /dev/null
+++ b/config_hh.json
@@ -0,0 +1,67 @@
+{
+    "classes" :
+    {
+        "X400" :
+        [
+            "/Users/mp744/Documents/CERN/hh2yybb/hh2yybbEventClassifier/ntuples/MGPy8_X400tohh_yybb.root"
+        ],
+        "X350" :
+        [
+            "/Users/mp744/Documents/CERN/hh2yybb/hh2yybbEventClassifier/ntuples/MGPy8_X350tohh_yybb.root"
+        ],
+        "X325" :
+        [
+            "/Users/mp744/Documents/CERN/hh2yybb/hh2yybbEventClassifier/ntuples/MGPy8_X325tohh_yybb.root"
+        ],
+        "H300" :
+        [
+            "/Users/mp744/Documents/CERN/hh2yybb/hh2yybbEventClassifier/ntuples/MGPy8_H300_Xtohh_yybb.root"
+        ],
+        "X275" :
+        [
+            "/Users/mp744/Documents/CERN/hh2yybb/hh2yybbEventClassifier/ntuples/MGPy8_X275tohh_yybb.root"
+        ],
+        "bkg" :
+        [
+            "/Users/mp744/Documents/CERN/hh2yybb/hh2yybbEventClassifier/ntuples/MGPy8_ybbj.root",
+            "/Users/mp744/Documents/CERN/hh2yybb/hh2yybbEventClassifier/ntuples/MGPy8_ybjj.root",
+            "/Users/mp744/Documents/CERN/hh2yybb/hh2yybbEventClassifier/ntuples/MGPy8_yjjj.root",
+            "/Users/mp744/Documents/CERN/hh2yybb/hh2yybbEventClassifier/ntuples/MGPy8_yybb.root",
+            "/Users/mp744/Documents/CERN/hh2yybb/hh2yybbEventClassifier/ntuples/MGPy8_yybj.root"
+        ]
+    },
+
+    "particles" :
+    {
+        "jet" :
+        {
+            "branches" :
+            [
+                "jet_pt",
+                "jet_eta",
+                "jet_phi",
+                "jet_m",
+                "jet_Jvt",
+                "jet_MV2c10_FixedCutBEff_60",
+                "jet_MV2c10_FixedCutBEff_70",
+                "jet_MV2c10_FixedCutBEff_77",
+                "jet_MV2c10_FixedCutBEff_85"
+            ],
+            "max_length" : 5
+        },
+
+        "photon":
+        {
+            "branches" :
+            [
+                "photon_pt",
+                "photon_eta",
+                "photon_phi",
+                "photon_isTight",
+                "photon_ptcone20",
+                "photon_topoEtcone40"
+            ],
+            "max_length" : 3
+        }
+    }
+}
\ No newline at end of file
diff --git a/data_processing.py b/data_processing.py
index e803038..8275b03 100644
--- a/data_processing.py
+++ b/data_processing.py
@@ -5,21 +5,19 @@
 from sklearn.cross_validation import train_test_split
 import pandautils as pup
 import warnings
+import logging
+from collections import OrderedDict
+from itertools import izip
 
-def _build_X(events, phrase, exclude_vars):
-    '''slices related branches into a numpy array
-    Args:
-        events: a pandas DataFrame containing the complete data by event
-        phrase: a string like 'Jet' corresponding to the related branches wanted
-    Returns:
-        output_array: a numpy array containing data only pertaining to the related branches
-    '''
-    branch_names = [key for key in events.keys() if (key.startswith(phrase) and (key not in exclude_vars))]
-    sliced_events = events[branch_names].as_matrix()
-    return sliced_events, branch_names
+logger = logging.getLogger('data_processing')
 
+def _pairwise(iterable):
+    '''s -> (s0, s1), (s2, s3), (s4, s5), ...'''
+    a = iter(iterable)
+    return izip(a, a)
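For reference, the new `_pairwise` helper groups a flat sequence into consecutive pairs; `shuffle_split_scale` below uses it to re-associate the flat output of `train_test_split` with each particle stream. A minimal standalone sketch (Python 2; the values are illustrative, this is not part of the diff):

```python
from itertools import izip

def _pairwise(iterable):
    '''s -> (s0, s1), (s2, s3), (s4, s5), ...'''
    a = iter(iterable)
    return izip(a, a)

# Pairs up a flat [train, test, train, test, ...] sequence per stream.
print list(_pairwise(['jet_train', 'jet_test', 'photon_train', 'photon_test']))
# [('jet_train', 'jet_test'), ('photon_train', 'photon_test')]
```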
 
-def read_in(class_files_dict, exclude_vars):
+
+def read_in(class_files_dict, tree_name, particles):
     '''
     takes in dict mapping class names to list of root files, loads them and slices them into ML format
     Args:
         class_files_dict: dictionary that links the names of the different classes
                           in the classification problem to the paths of the ROOT files
                           associated with each class; for example:
@@ -40,43 +38,67 @@
         ],
         ...
     }
-        exclude_vars: list of strings of names of branches not to be used for training
+        tree_name: string, name of the tree to open in the ntuples
+        particles: dictionary that provides information about the different particle streams
+                   in the events; for example:
+            {
+                "jet" :
+                {
+                    "branches" :
+                    [
+                        "jet_pt",
+                        "jet_eta"
+                    ],
+                    "max_length" : 5
+                },
+                "photon" :
+                {
+                    "branches" :
+                    [
+                        "photon_pt",
+                        "photon_eta"
+                    ],
+                    "max_length" : 3
+                }
+            }
     Returns:
-        X_jets: ndarray [n_ev, n_jet_feat] containing jet related branches
-        X_photons: ndarray [n_ev, n_photon_feat] containing photon related branches
-        X_muons: ndarray [n_ev, n_muon_feat] containing muon related branches
+        X: an OrderedDict containing the feature matrices for the different particle types, e.g.:
+           X = {
+               "jet" : X_jet,
+               "photon" : X_photon,
+               "muon" : X_muon
+           }
+           where each X_<particle> is an ndarray of dimensions [n_ev, n_features]
         y: ndarray [n_ev, 1] containing the truth labels
-        w: ndarray [n_ev, 1] containing EventWeights
-        jet_branches + photon_branches + muon_branches = list of strings that concatenates the individual
-        lists of variables for each particle type, e.g.:
-        ['Jet_Px', 'Jet_E', 'Muon_ID', 'Photon_Px']
+        w: ndarray [n_ev, 1] containing the event weights
+        le: LabelEncoder to transform numerical y back to its string values
     '''
 
     #convert files to pd data frames, assign key to y, concat all files
     def _make_df(val, key):
-        df = pup.root2panda(val, 'events')
+        df = pup.root2panda(val, tree_name)
         df['y'] = key
         return df
     all_events = pd.concat([_make_df(val, key) for key, val in class_files_dict.iteritems()], ignore_index=True)
-
-    #slice related branches
-    X_jets, jet_branches = _build_X(all_events, 'Jet', exclude_vars)
-    X_photons, photon_branches = _build_X(all_events, 'Photon', exclude_vars)
-    X_muons, muon_branches = _build_X(all_events, 'Muon', exclude_vars)
+    X = OrderedDict()
+    for particle_name, particle_info in particles.iteritems():
+        logger.info('Building X_{}'.format(particle_name))
+        X[particle_name] = all_events[particle_info["branches"]].values
 
     #transform string labels to integer classes
     le = LabelEncoder()
     y = le.fit_transform(all_events['y'].values)
 
-    w = all_events['EventWeight'].values
+    w = all_events['yybb_weight'].values
 
-    return X_jets, X_photons, X_muons, y, w, jet_branches + photon_branches + muon_branches
+    return X, y, w, le
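The per-particle slicing above reduces to taking a column subset per stream. A toy illustration with made-up stand-in data (the DataFrame below is hypothetical, not a real ntuple):

```python
import pandas as pd
from collections import OrderedDict

# Each entry of X is just the subset of columns listed under that
# particle's "branches", exactly as in the read_in loop above.
particles = {
    'jet': {'branches': ['jet_pt', 'jet_eta'], 'max_length': 5},
    'photon': {'branches': ['photon_pt'], 'max_length': 3},
}
all_events = pd.DataFrame({
    'jet_pt': [40.0, 55.0], 'jet_eta': [0.1, -1.2],
    'photon_pt': [40.0, 40.0], 'y': ['bkg', 'X400'],
})
X = OrderedDict()
for particle_name, particle_info in particles.iteritems():
    X[particle_name] = all_events[particle_info['branches']].values
print X['jet'].shape  # (2, 2)
```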
 
 
 def _scale(matrix_train, matrix_test):
     '''
-    Use scikit learn to sclae features to 0 mean, 1 std.
+    Use scikit learn to scale features to 0 mean, 1 std.
     Because of event-level structure, we need to flatten X, scale, and then reshape back into event format.
     Args:
         matrix_train: X_train [n_ev_train, n_particle_features], numpy ndarray of unscaled features of events allocated for training
@@ -99,47 +121,62 @@
     return matrix_train, matrix_test
 
 
-def shuffle_split_scale(X_jets, X_photons, X_muons, y, w):
+def shuffle_split_scale(X, y, w):
     '''
-    takes in X_jets, X_photons, X_Muons, y and w nd arrays, shuffles them, splits them into test (40%) and training (60%) sets
+    Shuffle data, split it into test (40%) and training (60%) sets, scale X
     Args:
-        X_jets: ndarray [n_ev, n_jet_feat] containing jet related branches
-        X_photons: ndarray [n_ev, n_photon_feat] containing photon related branches
-        X_muons: ndarray [n_ev, n_muon_feat] containing muon related branches
+        X: an OrderedDict containing the feature matrices for the different particle types, e.g.:
+           X = {
+               "jet" : X_jet,
+               "photon" : X_photon,
+               "muon" : X_muon
+           }
+           where each X_<particle> is an ndarray of dimensions [n_ev, n_features]
         y: ndarray [n_ev, 1] containing the truth labels
-        w: ndarray [n_ev, 1] containing EventWeights
+        w: ndarray [n_ev, 1] containing the event weights
    Returns:
-        X_jets_train: ndarray [n_ev_train, n_jet_feat] containing the events of jet related branches allocated for training
-        X_jets_test: ndarray [n_ev_test, n_jet_feat] containing the events of jet related branches allocated for testing
-        X_photons_train: ndarray [n_ev_train, n_photon_feat] containing the events of photon related branches allocated for training
-        X_photons_test: ndarray [n_ev_test, n_photon_feat] containing the events of photon related branches allocated for testing
-        X_muons_train: ndarray [n_ev_train, n_muon_feat] containing the events of muon related branches allocated for training
-        X_muons_test: ndarray [n_ev_test, n_muon_feat] containing the events of muon related branches allocated for testing
-        Y_train: ndarray [n_ev_train, 1] containing the shuffled truth labels for training
-        Y_test: ndarray [n_ev_test, 1] containing the shuffled truth labels allocated for testing
-        W_train: ndarray [n_ev_train, 1] containing the shuffled EventWeights allocated for training
-        W_test: ndarray [n_ev_test, 1] containing the shuffled EventWeights allocated for testing
+        data: an OrderedDict containing all X, y, w ndarrays for all particles (both train and test), e.g.:
+           data = {
+               "X_jet_train" : X_jet_train,
+               "X_jet_test" : X_jet_test,
+               "X_photon_train" : X_photon_train,
+               "X_photon_test" : X_photon_test,
+               "y_train" : y_train,
+               "y_test" : y_test,
+               "w_train" : w_train,
+               "w_test" : w_test
+           }
     '''
-    #shuffle events & split into testing and training sets
-    X_jets_train, X_jets_test, \
-    X_photons_train, X_photons_test, \
-    X_muons_train, X_muons_test, \
-    Y_train, Y_test, \
-    W_train, W_test = train_test_split(X_jets, X_photons, X_muons, y, w, test_size=0.4)
+    logger.info('Shuffling, splitting and scaling')
 
-    X_jets_train, X_jets_test = _scale(X_jets_train, X_jets_test)
-    X_photons_train, X_photons_test = _scale(X_photons_train, X_photons_test)
-    X_muons_train, X_muons_test = _scale(X_muons_train, X_muons_test)
+    data_tuple = train_test_split(*(X.values() + [y, w]), test_size=0.4)
 
-    return X_jets_train, X_jets_test, X_photons_train, X_photons_test, X_muons_train, X_muons_test, Y_train, Y_test, W_train, W_test
+    data = OrderedDict()
+    for particle, (train, test) in zip(X.keys(), _pairwise(data_tuple[:(2 * len(X))])):
+        data['X_' + particle + '_train'], data['X_' + particle + '_test'] = _scale(train, test)
+    data['y_train'], data['y_test'], data['w_train'], data['w_test'] = data_tuple[-4:]
+
+    return data
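To make the `data_tuple` bookkeeping above concrete: with k particle streams plus y and w, `train_test_split` returns 2*(k + 2) arrays, alternating train/test in input order, and `_pairwise` walks the first 2*k of them two at a time. A small sketch with toy arrays (names and shapes are illustrative):

```python
import numpy as np
from sklearn.cross_validation import train_test_split

# Two toy "streams" plus labels and weights for 10 events.
X_jet = np.arange(20).reshape(10, 2)
X_photon = np.arange(10).reshape(10, 1)
y = np.arange(10)
w = np.ones(10)

out = train_test_split(X_jet, X_photon, y, w, test_size=0.4)
# out = [X_jet_train, X_jet_test, X_photon_train, X_photon_test,
#        y_train, y_test, w_train, w_test]
print len(out), out[0].shape, out[1].shape  # 8 (6, 2) (4, 2)
```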
 
 
-def zero_padding(X, max_length):
-    '''
+def padding(X, max_length, value=-999):
+    '''
+    Transforms X into a 3D array where the dimensions correspond to [n_ev, n_particles, n_features].
+    n_particles is now fixed and equal to max_length.
+    If the number of particles in an event was < max_length, the missing particles are filled with default values.
+    If the number of particles in an event was > max_length, the excess particles are removed.
+    Args:
+        X: ndarray [n_ev, n_features] with an arbitrary number of particles per event
+        max_length: int, the number of particles to keep per event
+        value (optional): the value to fill in when an event has too few particles, default=-999
+    Returns:
+        X_pad: ndarray [n_ev, n_particles, n_features], padded version of X with a fixed number of particles
+    Note:
+        Use Masking downstream to skip the particles whose entries are the artificial value -999
     '''
-    data = -999 * np.ones((X.shape[0], max_length, X.shape[1]), dtype='float32')
+    X_pad = value * np.ones((X.shape[0], max_length, X.shape[1]), dtype='float32')
     for i, row in enumerate(X):
-        data[i, :min(len(row[0]), max_length), :] = np.array(row.tolist()).T[:min(len(row[0]), max_length), :]
+        X_pad[i, :min(len(row[0]), max_length), :] = np.array(row.tolist()).T[:min(len(row[0]), max_length), :]
 
-    return data
+    return X_pad
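A toy demonstration of `padding`, assuming the jagged event layout used throughout the pipeline, where X[i, j] holds the array of feature-j values for all particles in event i (the numbers are made up):

```python
import numpy as np

# One event with 2 jets and 2 features, padded out to 5 jets.
X = np.empty((1, 2), dtype=object)
X[0, 0] = np.array([40.0, 25.0])   # jet_pt of the 2 jets
X[0, 1] = np.array([0.1, -1.2])    # jet_eta of the 2 jets

X_pad = -999 * np.ones((X.shape[0], 5, X.shape[1]), dtype='float32')
for i, row in enumerate(X):
    # row.tolist() -> [features][particles]; transpose to [particles][features]
    X_pad[i, :min(len(row[0]), 5), :] = np.array(row.tolist()).T[:min(len(row[0]), 5), :]

print X_pad.shape      # (1, 5, 2)
print X_pad[0, :3, 0]  # [  40.   25. -999.]
```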
diff --git a/pipeline.py b/pipeline.py
index 1ff81cd..5d7844d 100644
--- a/pipeline.py
+++ b/pipeline.py
@@ -1,5 +1,5 @@
 import json
-from data_processing import read_in, shuffle_split_scale, zero_padding
+from data_processing import read_in, shuffle_split_scale, padding
 import pandautils as pup
 import cPickle
 from plotting import plot_inputs, plot_NN
@@ -10,11 +10,11 @@
 #from plotting import plot_inputs, plot_performance
 #from nn_model import train, test
 
-def main(json_config, exclude_vars):
+def main(json_config, tree_name):
     '''
     Args:
     -----
-        json_config: a JSON file, containing a dictionary that links the names of the different
+        json_config: path to a JSON file, containing a dictionary that links the names of the different
                      classes in the classification problem to the paths of the ROOT files
                      associated with each class; for example:
@@ -31,16 +31,18 @@
         ],
         ...
     }
-        exclude_vars: list of strings of names of branches not to be used for training
+        tree_name: string, name of the tree that contains the correct branches
 
     Saves:
     ------
-        'processed_data.h5': dictionary with processed ndarrays (X, y, w) for all particles for training and testing
+        'processed_data_<hash>.pkl': dictionary with processed ndarrays (X, y, w) for all particles for training and testing
     '''
     logger = logging.getLogger('Main')
 
     # -- load in the JSON file
-    logger.info('Loading JSON config')
-    class_files_dict = json.load(open(json_config))
+    logger.info('Loading information from ' + json_config)
+    config = utils.load_config(json_config)
+    class_files_dict = config['classes']
+    particles_dict = config['particles']
 
     # -- hash the config dictionary to check if the pickled data exists
     from hashlib import md5
@@ -50,52 +52,53 @@ def sha(s):
         m.update(s.__repr__())
         return m.hexdigest()[:5]
 
     # -- if the pickle exists, use it
+    pickle_name = 'processed_data_' + sha(config) + '.pkl'
     try:
-        data = cPickle.load(open('processed_data_' + sha(class_files_dict) + '.pkl', 'rb'))
-        logger.info('Preprocessed data found in pickle')
-        X_jets_train = data['X_jets_train']
-        X_jets_test = data['X_jets_test']
-        X_photons_train = data['X_photons_train']
-        X_photons_test = data['X_photons_test']
-        X_muons_train = data['X_muons_train']
-        X_muons_test = data['X_muons_test']
-        y_train = data['y_train']
-        y_test = data['y_test']
-        w_train = data['w_train']
-        w_test = data['w_test']
-        varlist = data['varlist']
-
-    # -- otherwise, process the new data
+        logger.info('Attempting to read from {}'.format(pickle_name))
+        data = cPickle.load(open(pickle_name, 'rb'))
+        logger.info('Pre-processed data found and loaded from pickle')
+    # -- otherwise, process the new data
     except IOError:
-        logger.info('Preprocessed data not found')
+        logger.info('Pre-processed data not found in {}'.format(pickle_name))
         logger.info('Processing data')
         # -- transform ROOT files into standard ML format (ndarrays)
-        X_jets, X_photons, X_muons, y, w, varlist = read_in(class_files_dict, exclude_vars)
+        X, y, w, le = read_in(class_files_dict, tree_name, particles_dict)
 
         # -- shuffle, split samples into train and test set, scale features
-        X_jets_train, X_jets_test, \
-        X_photons_train, X_photons_test, \
-        X_muons_train, X_muons_test, \
-        y_train, y_test, \
-        w_train, w_test = shuffle_split_scale(X_jets, X_photons, X_muons, y, w)
+        data = shuffle_split_scale(X, y, w)
+
+        data.update({
+            'varlist' : [
+                branch
+                for particle_info in particles_dict.values()
+                for branch in particle_info['branches']
+            ],
+            'LabelEncoder' : le
+        })
+
+        # -- plot distributions:
+        '''
+        This should produce normed, weighted histograms of the input distributions for all variables.
+        The train and test distributions should be shown for every class.
+        Plots should be saved out to pdf with informative names.
+        '''
+        logger.info('Saving input distributions in ./plots/')
+        plot_inputs(data, particles_dict.keys())
+
+        logger.info('Padding')
+        for key in data:
+            if key.startswith('X_'):
+                data[key] = padding(data[key], particles_dict[key.split('_')[1]]['max_length'])
+                # ^ assuming the naming convention X_<particle>_train, X_<particle>_test
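The padding loop above leans on the `X_<particle>_<split>` key convention, which is also why `utils.load_config` rejects particle names containing an underscore. An illustrative sketch of the lookup (the keys and max_lengths are examples, not the real run):

```python
# The particle name must come back out of the key intact via split('_')[1].
data_keys = ['X_jet_train', 'X_jet_test', 'X_photon_train', 'X_photon_test']
particles_dict = {'jet': {'max_length': 5}, 'photon': {'max_length': 3}}
for key in data_keys:
    particle = key.split('_')[1]
    print key, '->', particles_dict[particle]['max_length']
# A particle named 'fat_jet' would give the key 'X_fat_jet_train', whose
# split yields 'fat' and fails the lookup; hence the validation in utils.py.
```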
 
     # -- save out to pickle
-    logger.info('Saving processed data to pickle')
-    cPickle.dump({
-        'X_jets_train' : X_jets_train,
-        'X_jets_test' : X_jets_test,
-        'X_photons_train' : X_photons_train,
-        'X_photons_test' : X_photons_test,
-        'X_muons_train' : X_muons_train,
-        'X_muons_test' : X_muons_test,
-        'y_train' : y_train,
-        'y_test' : y_test,
-        'w_train' : w_train,
-        'w_test' : w_test,
-        'varlist' : varlist
-        },
-        open('processed_data_' + sha(class_files_dict) + '.pkl', 'wb'),
+    logger.info('Saving processed data to {}'.format(pickle_name))
+    cPickle.dump(data,
+        open(pickle_name, 'wb'),
         protocol=cPickle.HIGHEST_PROTOCOL)
-    # -- plot distributions:
-    '''
-    This should produce normed, weighted histograms of the input distributions for all variables
-    The train and test distributions should be shown for every class
-    Plots should be saved out a pdf with informative names
-    '''
-    logger.info('Plotting input distributions')
-    plot_inputs(
-        X_jets_train, X_jets_test,
-        X_photons_train, X_photons_test,
-        X_muons_train, X_muons_test,
-        y_train, y_test,
-        w_train, w_test,
-        varlist
-    )
-
-    X_jets_train, X_jets_test, \
-    X_photons_train, X_photons_test, \
-    X_muons_train, X_muons_test = map(zero_padding,
-        [
-            X_jets_train, X_jets_test,
-            X_photons_train, X_photons_test,
-            X_muons_train, X_muons_test
-        ],
-        [5, 5, 3, 3, 2, 2]
-    )
 
     # # -- train
     # # design a Keras NN with three RNN streams (jets, photons, muons)
     io.save(('X_jets_NN.h5'), NN(X_jets_train, X_jets_test, y_train))
@@ -139,16 +174,16 @@
     # # combine the outputs and process them through a bunch of FF layers
     # # use a validation split of 20%
     # # save out the weights to hdf5 and the model to yaml
-    # net = train(X_jets_train, X_photons_train, X_muons_train, y_train, w_train)
+    # net = train(data)
 
     # # -- test
     # # evaluate performance on the test set
-    # yhat = test(net, X_jets_test, X_photons_test, X_muons_test, y_test, w_test)
+    # yhat = test(net, data)
 
     # # -- plot performance
     # # produce ROC curves to evaluate performance
     # # save them out to pdf
-    # plot_performance(yhat, y_test, w_test)
+    # plot_performance(yhat, data['y_test'], data['w_test'])
 
 
 if __name__ == '__main__':
@@ -159,9 +194,9 @@
 
     # -- read in arguments
     parser = argparse.ArgumentParser()
-    parser.add_argument('config', help="JSON file that specifies classes and corresponding ROOT files' paths")
-    parser.add_argument('--exclude', help="names of branches to exclude from training", nargs="*", default=[])
+    parser.add_argument('config', help="path to JSON file that specifies classes and corresponding ROOT files' paths")
+    parser.add_argument('--tree', help="name of the tree to open in the ntuples", default='mini')
     args = parser.parse_args()
 
     # -- pass arguments to main
-    sys.exit(main(args.config, args.exclude))
+    sys.exit(main(args.config, args.tree))
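For context on the `LabelEncoder` that `read_in` returns and `_plot_X` consumes below, a minimal round-trip sketch (the class names are examples taken from the config):

```python
from sklearn.preprocessing import LabelEncoder

# fit_transform maps class-name strings to integers 0..k-1 (sorted order);
# inverse_transform recovers the strings for the plot legends.
le = LabelEncoder()
y = le.fit_transform(['bkg', 'X400', 'bkg', 'X275'])
print y                                # [2 1 2 0]
print le.inverse_transform([0, 1, 2])  # ['X275' 'X400' 'bkg']
```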
diff --git a/plotting.py b/plotting.py
index 92452e5..505ff3a 100644
--- a/plotting.py
+++ b/plotting.py
@@ -4,17 +4,19 @@
 from matplotlib.pyplot import cm
 import pandautils as pup
 import os
+from sklearn.preprocessing import LabelEncoder
 
-def _plot_X(train, test, y_train, y_test, w_train, w_test, varlist, feature):
+def _plot_X(train, test, y_train, y_test, w_train, w_test, varlist, le, feature):
     '''
     Args:
         train: ndarray [n_ev_train, n_muon_feat] containing the events allocated for training
         test: ndarray [n_ev_test, n_muon_feat] containing the events allocated for testing
-        y_train: ndarray [n_ev_train, 1] containing the shuffled truth labels for training
-        y_test: ndarray [n_ev_test, 1] containing the shuffled truth labels allocated for testing
+        y_train: ndarray [n_ev_train, 1] containing the shuffled truth labels for training in numerical format
+        y_test: ndarray [n_ev_test, 1] containing the shuffled truth labels allocated for testing in numerical format
         w_train: ndarray [n_ev_train, 1] containing the shuffled EventWeights allocated for training
         w_test: ndarray [n_ev_test, 1] containing the shuffled EventWeights allocated for testing
-        varlist: list of names of branches like 'Jet_px', 'Photon_E', 'Muon_Iso'
+        varlist: list of names of branches like 'jet_px', 'photon_E', 'muon_Iso'
+        le: LabelEncoder to transform numerical y back to its string values
         feature: a string like 'Jet', 'Muon', 'Photon'
     Returns:
         Saves .pdf histograms for each feature-related branch plotting the training and test sets for each class
@@ -36,11 +38,12 @@
         flat_test = pup.flatten(test[:, column_counter])
         matplotlib.rcParams.update({'font.size': 16})
         fig = plt.figure(figsize=(11.69, 8.27), dpi=100)
+
         bins = np.linspace(
             min(min(flat_train), min(flat_test)),
             max(max(flat_train), max(flat_test)),
             30)
-        color = iter(cm.rainbow(np.linspace(0, 1, 2)))
+        color = iter(cm.rainbow(np.linspace(0, 1, len(np.unique(y_train)))))
         # -- loop through the classes
         for k in range(len(np.unique(y_train))):
             c = next(color)
@@ -48,7 +51,7 @@
                 bins=bins,
                 histtype='step',
                 normed=True,
-                label='Train - class: '+str(k),
+                label='Train - ' + le.inverse_transform(k),
                 weights=w_train_ext[y_train_ext == k],
                 color=c,
                 linewidth=1)
@@ -56,55 +59,43 @@
                 bins=bins,
                 histtype='step',
                 normed=True,
-                label='Test - class: ' + str(k),
+                label='Test - ' + le.inverse_transform(k),
                 weights=w_test_ext[y_test_ext == k],
                 color=c,
                 linewidth=2,
                 linestyle='dashed')
-        plt.xlabel(key)
+        plt.title(key)
         plt.yscale('log')
         plt.ylabel('Weighted Events')
-        plt.legend()
+        plt.legend(prop={'size': 10}, fancybox=True, framealpha=0.5)
         try:
             plt.savefig(os.path.join('plots', key + '.pdf'))
         except IOError:
             os.makedirs('plots')
             plt.savefig(os.path.join('plots', key + '.pdf'))
-        #plt.show()
         column_counter += 1
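A self-contained sketch of the overlay pattern `_plot_X` implements (normed, weighted, per-class train-solid versus test-dashed histograms on shared bins), using toy data rather than the pipeline's arrays:

```python
import numpy as np
import matplotlib.pyplot as plt

# Toy stand-ins for one class's flattened train/test values and unit weights.
train_vals = np.random.normal(0.0, 1.0, 1000)
test_vals = np.random.normal(0.1, 1.0, 500)

# Shared bins spanning both samples, as in _plot_X.
bins = np.linspace(min(train_vals.min(), test_vals.min()),
                   max(train_vals.max(), test_vals.max()), 30)
plt.hist(train_vals, bins=bins, histtype='step', normed=True,
         weights=np.ones_like(train_vals), label='Train', linewidth=1)
plt.hist(test_vals, bins=bins, histtype='step', normed=True,
         weights=np.ones_like(test_vals), label='Test', linewidth=2,
         linestyle='dashed')
plt.yscale('log')
plt.legend()
plt.savefig('toy_overlay.pdf')
```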
 
 
-def plot_inputs(X_jets_train, X_jets_test, X_photons_train, X_photons_test,
-    X_muons_train, X_muons_test, y_train, y_test, w_train, w_test, varlist):
+def plot_inputs(data, particle_names):
     '''
     Args:
-        X_jets_train: ndarray [n_ev_train, n_jet_feat] containing the
-            events of jet related branches allocated for training
-        X_jets_test: ndarray [n_ev_test, n_jet_feat] containing the
-            events of jet related branches allocated for testing
-        X_photons_train: ndarray [n_ev_train, n_photon_feat] containing
-            the events of photon related branches allocated for training
-        X_photons_test: ndarray [n_ev_test, n_photon_feat] containing
-            the events of photon related branches allocated for testing
-        X_muons_train: ndarray [n_ev_train, n_muon_feat] containing the
-            events of muon related branches allocated for training
-        X_muons_test: ndarray [n_ev_test, n_muon_feat] containing the
-            events of muon related branches allocated for testing
-        Y_train: ndarray [n_ev_train, 1] containing the shuffled truth
-            labels for training
-        Y_test: ndarray [n_ev_test, 1] containing the shuffled truth labels
-            allocated for testing
-        W_train: ndarray [n_ev_train, 1] containing the shuffled EventWeights
-            allocated for training
-        W_test: ndarray [n_ev_test, 1] containing the shuffled EventWeights
-            allocated for testing
-        varlist: list of strings that concatenates the individual
-            lists of variables for each particle type, e.g.:
-            ['Jet_Px', 'Jet_E', 'Muon_ID', 'Photon_Px']
+        data: an OrderedDict containing all X, y, w ndarrays for all particles (both train and test), e.g.:
+            data = {
+                "X_jet_train" : X_jet_train,
+                "X_jet_test" : X_jet_test,
+                "X_photon_train" : X_photon_train,
+                "X_photon_test" : X_photon_test,
+                "y_train" : y_train,
+                "y_test" : y_test,
+                "w_train" : w_train,
+                "w_test" : w_test
+            }
+        particle_names: list of strings, names of particle streams
     Returns:
         Saves .pdf histograms plotting the training and test sets of each class for each feature
     '''
-    _plot_X(X_jets_train, X_jets_test, y_train, y_test, w_train, w_test, varlist, 'Jet')
-    _plot_X(X_photons_train, X_photons_test, y_train, y_test, w_train, w_test, varlist, 'Photon')
-    _plot_X(X_muons_train, X_muons_test, y_train, y_test, w_train, w_test, varlist, 'Muon')
+    for particle in particle_names:
+        _plot_X(
+            data['X_' + particle + '_train'],
+            data['X_' + particle + '_test'],
+            data['y_train'],
+            data['y_test'],
+            data['w_train'],
+            data['w_test'],
+            data['varlist'],
+            data['LabelEncoder'],
+            particle
+        )
diff --git a/utils.py b/utils.py
index a61fc72..e62f570 100644
--- a/utils.py
+++ b/utils.py
@@ -1,4 +1,5 @@
 import logging
+import json
 
 def configure_logging():
     rlogger = logging.getLogger()
@@ -7,4 +8,23 @@
     logging.addLevelName(logging.WARNING, "\033[1;31m{:8}\033[1;0m".format(logging.getLevelName(logging.WARNING)))
     logging.addLevelName(logging.ERROR, "\033[1;35m{:8}\033[1;0m".format(logging.getLevelName(logging.ERROR)))
     logging.addLevelName(logging.INFO, "\033[1;32m{:8}\033[1;0m".format(logging.getLevelName(logging.INFO)))
-    logging.addLevelName(logging.DEBUG, "\033[1;34m{:8}\033[1;0m".format(logging.getLevelName(logging.DEBUG)))
\ No newline at end of file
+    logging.addLevelName(logging.DEBUG, "\033[1;34m{:8}\033[1;0m".format(logging.getLevelName(logging.DEBUG)))
+
+
+def load_config(config_file):
+    # TO DO: validate types of entries in the config
+    config = json.load(open(config_file, 'r'))
+    required_keys = ['classes', 'particles']
+    required_particle_keys = ['branches', 'max_length']
+
+    for k in required_keys:
+        if k not in config.keys():
+            raise KeyError('Pipeline configuration requires key: {}'.format(k))
+
+    for particle_name, particle_info in config['particles'].iteritems():
+        if '_' in particle_name:
+            raise ValueError('Particle names cannot have _ in them')
+        for k in required_particle_keys:
+            if k not in particle_info.keys():
+                raise KeyError('Particle configuration requires key: {}'.format(k))
+
+    return config
\ No newline at end of file
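A hedged usage sketch of the new `load_config` entry point; it assumes `config_hh.json` and `utils.py` sit in the working directory, and shows the validation failures one should expect:

```python
import utils

# Load and validate the pipeline configuration shown at the top of this diff.
config = utils.load_config('config_hh.json')
print config['classes'].keys()                  # ['X400', 'X350', ...]
print config['particles']['jet']['max_length']  # 5

# A malformed config fails loudly: a missing 'particles' section raises
# KeyError, and a particle named e.g. 'fat_jet' raises
# ValueError('Particle names cannot have _ in them').
```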