Refactoring to generalize to N streams
mickypaganini committed Jul 6, 2016
1 parent 21b357a commit 2d59912
Showing 4 changed files with 98 additions and 99 deletions.
72 changes: 38 additions & 34 deletions data_processing.py
@@ -5,21 +5,30 @@
from sklearn.cross_validation import train_test_split
import pandautils as pup
import warnings
import logging
from collections import OrderedDict
from itertools import izip

def _build_X(events, phrase, exclude_vars):
logger = logging.getLogger('data_processing')

def _pairwise(iterable):
"s -> (s0, s1), (s2, s3), (s4, s5), ..."
a = iter(iterable)
return izip(a, a)
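
As a quick illustration (values hypothetical, not from the commit), _pairwise walks a flat sequence two items at a time; this is what lets the interleaved output of train_test_split be regrouped into (train, test) pairs further down:

    # hypothetical sketch of _pairwise's behavior
    pairs = list(_pairwise(['jet_tr', 'jet_te', 'photon_tr', 'photon_te']))
    # pairs == [('jet_tr', 'jet_te'), ('photon_tr', 'photon_te')]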

def _build_X(events, particle_branches):
'''slices related branches into a numpy array
Args:
events: a pandas DataFrame containing the complete data by event
particle_branches: list of names of the branches to slice out for this stream
Returns:
sliced_events: a numpy array containing data only pertaining to the requested branches
'''
branch_names = [key for key in events.keys() if (key.startswith(phrase) and (key not in exclude_vars))]
sliced_events = events[branch_names].as_matrix()
return sliced_events, branch_names
sliced_events = events[particle_branches].values
return sliced_events
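
For example, with hypothetical branch names (assuming they exist in the ntuple):

    # branch names are illustrative, not taken from the commit
    X_jet = _build_X(all_events, ['Jet_Pt', 'Jet_Eta', 'Jet_Phi'])
    # X_jet is an ndarray of shape [n_ev, 3], one column per requested branch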


def read_in(class_files_dict, exclude_vars):
def read_in(class_files_dict, tree_name, streams):
'''
takes in a dict mapping class names to lists of ROOT files, loads them and slices them into ML format
Args:
@@ -40,7 +49,8 @@ def read_in(class_files_dict, exclude_vars):
],
...
}
exclude_vars: list of strings of names of branches not to be used for training
tree_name: string, name of the tree to open in the ntuples
streams: dict mapping each stream name (e.g. 'jet') to its branch information, as read from the 'particles' section of the config
Returns:
X: OrderedDict mapping each stream name to an ndarray [n_ev, n_feat] containing that stream's related branches
@@ -54,24 +64,26 @@ def read_in(class_files_dict, exclude_vars):

#convert files to pd data frames, assign key to y, concat all files
def _make_df(val, key):
df = pup.root2panda(val, 'events')
df = pup.root2panda(val, tree_name)
df['y'] = key
return df

all_events = pd.concat([_make_df(val, key) for key, val in class_files_dict.iteritems()], ignore_index=True)

#slice related branches
X_jets, jet_branches = _build_X(all_events, 'Jet', exclude_vars)
X_photons, photon_branches = _build_X(all_events, 'Photon', exclude_vars)
X_muons, muon_branches = _build_X(all_events, 'Muon', exclude_vars)

X = OrderedDict()
for stream_name, stream_info in streams.iteritems():
logger.info('building X_{}'.format(stream_name))
X[stream_name] = _build_X(all_events, stream_info["branches"])

#transform string labels to integer classes
le = LabelEncoder()
y = le.fit_transform(all_events['y'].values)

w = all_events['EventWeight'].values
#w = all_events['eventWeight'].values
w = all_events['yybb_weight'].values

return X_jets, X_photons, X_muons, y, w, jet_branches + photon_branches + muon_branches
return X, y, w
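
Based on how stream_info["branches"] is consumed here, and on how pipeline.py passes config['particles'] in as streams, the streams argument presumably has the following shape (stream names, branch names and tree name all illustrative):

    # hypothetical example of the streams argument
    streams = OrderedDict([
        ('jet',    {'branches': ['Jet_Pt', 'Jet_Eta'], 'max_length': 5}),
        ('photon', {'branches': ['Photon_Pt'],         'max_length': 3}),
    ])
    X, y, w = read_in(class_files_dict, 'events', streams)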


def _scale(matrix_train, matrix_test):
@@ -99,7 +111,8 @@ def _scale(matrix_train, matrix_test):
return matrix_train, matrix_test


def shuffle_split_scale(X_jets, X_photons, X_muons, y, w):
def shuffle_split_scale(X, y, w):
'''
takes in the OrderedDict X of feature ndarrays, plus y and w, shuffles them, splits them into test (40%) and training (60%) sets, and scales each X stream
Args:
@@ -109,29 +122,20 @@ def shuffle_split_scale(X_jets, X_photons, X_muons, y, w):
y: ndarray [n_ev, 1] containing the truth labels
w: ndarray [n_ev, 1] containing EventWeights
Returns:
data: an OrderedDict containing, for each stream in X:
X_<stream>_train: ndarray [n_ev_train, n_feat] containing the scaled training events of that stream's branches
X_<stream>_test: ndarray [n_ev_test, n_feat] containing the scaled testing events of that stream's branches
plus:
y_train: ndarray [n_ev_train, 1] containing the shuffled truth labels for training
y_test: ndarray [n_ev_test, 1] containing the shuffled truth labels allocated for testing
w_train: ndarray [n_ev_train, 1] containing the shuffled EventWeights allocated for training
w_test: ndarray [n_ev_test, 1] containing the shuffled EventWeights allocated for testing
'''
#shuffle events & split into testing and training sets
X_jets_train, X_jets_test, \
X_photons_train, X_photons_test, \
X_muons_train, X_muons_test, \
Y_train, Y_test, \
W_train, W_test = train_test_split(X_jets, X_photons, X_muons, y, w, test_size=0.4)
logger.info('shuffling, splitting and scaling X')

data_tuple = train_test_split(*(X.values() + [y, w]), test_size=0.4)

X_jets_train, X_jets_test = _scale(X_jets_train, X_jets_test)
X_photons_train, X_photons_test = _scale(X_photons_train, X_photons_test)
X_muons_train, X_muons_test = _scale(X_muons_train, X_muons_test)
data = OrderedDict()
for particle, (train, test) in zip(X.keys(), _pairwise(data_tuple[:(2 * len(X))])):
data['X_' + particle + '_train'], data['X_' + particle + '_test'] = _scale(train, test)

return X_jets_train, X_jets_test, X_photons_train, X_photons_test, X_muons_train, X_muons_test, Y_train, Y_test, W_train, W_test
data['y_train'], data['y_test'], data['w_train'], data['w_test'] = data_tuple[-4:]

return data
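
Continuing the hypothetical two-stream example from above, the returned OrderedDict would be keyed like this (the X_ keys depend on the streams actually present in X):

    data = shuffle_split_scale(X, y, w)
    # data.keys() -> ['X_jet_train', 'X_jet_test',
    #                 'X_photon_train', 'X_photon_test',
    #                 'y_train', 'y_test', 'w_train', 'w_test']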


def padding(X, max_length, value=-999):
85 changes: 28 additions & 57 deletions pipeline.py
@@ -8,7 +8,7 @@
#from plotting import plot_inputs, plot_performance
#from nn_model import train, test

def main(json_config, exclude_vars):
def main(json_config, tree_name):
'''
Args:
-----
@@ -29,7 +29,7 @@ def main(json_config, exclude_vars):
],
...
}
exclude_vars: list of strings of names of branches not to be used for training
tree_name: string, name of the tree to open in the ntuples
Saves:
------
'processed_data_<hash>.pkl': dictionary with processed ndarrays (X, y, w) for all particles for training and testing
@@ -38,7 +38,9 @@ def main(json_config, exclude_vars):

# -- load in the JSON file
logger.info('Loading JSON config')
class_files_dict = json.load(open(json_config))
config = utils.load_config(json_config)
class_files_dict = config['classes']
particles = config['particles']

# -- hash the config dictionary to check if the pickled data exists
from hashlib import md5
Expand All @@ -48,49 +50,31 @@ def sha(s):
m.update(s.__repr__())
return m.hexdigest()[:5]
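
The fingerprint keys the on-disk cache, so a different class-to-files mapping yields a different pickle name. A sketch of the resulting filename (digest illustrative):

    fname = 'processed_data_' + sha(class_files_dict) + '.pkl'
    # e.g. 'processed_data_1a2b3.pkl', the first five hex chars of md5(repr(dict))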

# -- if the pickle exists, use it
try:
data = cPickle.load(open('processed_data_' + sha(class_files_dict) + '.pkl', 'rb'))
logger.info('Preprocessed data found in pickle')
X_jets_train = data['X_jets_train']
X_jets_test = data['X_jets_test']
X_photons_train = data['X_photons_train']
X_photons_test = data['X_photons_test']
X_muons_train = data['X_muons_train']
X_muons_test = data['X_muons_test']
y_train = data['y_train']
y_test = data['y_test']
w_train = data['w_train']
w_test = data['w_test']
varlist = data['varlist']

# -- otherwise, process the new data
except IOError:
logger.info('Preprocessed data not found')
logger.info('Processing data')
# -- transform ROOT files into standard ML format (ndarrays)
X_jets, X_photons, X_muons, y, w, varlist = read_in(class_files_dict, exclude_vars)
X, y, w = read_in(class_files_dict, tree_name, particles)

# -- shuffle, split samples into train and test set, scale features
X_jets_train, X_jets_test, \
X_photons_train, X_photons_test, \
X_muons_train, X_muons_test, \
y_train, y_test, \
w_train, w_test = shuffle_split_scale(X_jets, X_photons, X_muons, y, w)
data = shuffle_split_scale(X, y, w)

data.update({
'varlist' : [
branch
for particle_info in particles.values()
for branch in particle_info['branches']
]
})
# -- save out to pickle
logger.info('Saving processed data to pickle')
cPickle.dump({
'X_jets_train' : X_jets_train,
'X_jets_test' : X_jets_test,
'X_photons_train' : X_photons_train,
'X_photons_test' : X_photons_test,
'X_muons_train' : X_muons_train,
'X_muons_test' : X_muons_test,
'y_train' : y_train,
'y_test' : y_test,
'w_train' : w_train,
'w_test' : w_test,
'varlist' : varlist
},
cPickle.dump(data,
open('processed_data_' + sha(class_files_dict) + '.pkl', 'wb'),
protocol=cPickle.HIGHEST_PROTOCOL)

Expand All @@ -101,25 +85,12 @@ def sha(s):
Plots should be saved out as a pdf with informative names
'''
logger.info('Plotting input distributions')
plot_inputs(
X_jets_train, X_jets_test,
X_photons_train, X_photons_test,
X_muons_train, X_muons_test,
y_train, y_test,
w_train, w_test,
varlist
)

X_jets_train, X_jets_test, \
X_photons_train, X_photons_test, \
X_muons_train, X_muons_test = map(padding,
[
X_jets_train, X_jets_test,
X_photons_train, X_photons_test,
X_muons_train, X_muons_test
],
[5, 5, 3, 3, 2, 2]
)
plot_inputs(data, particles.keys())

logger.info('Padding')
for key in data:
if key.startswith('X_'):
data[key] = padding(data[key], particles[key.split('_')[1]]['max_length'])
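
padding's body is collapsed in this diff; judging from its signature padding(X, max_length, value=-999), it presumably truncates or pads each event's variable-length arrays to a fixed length with a sentinel value. A minimal sketch under that assumption, not the committed implementation:

    import numpy as np

    def padding_sketch(X, max_length, value=-999):
        # X: iterable of variable-length arrays, one per event (assumed shape)
        out = np.full((len(X), max_length), value, dtype='float64')
        for i, row in enumerate(X):
            # copy up to max_length entries; the rest stays at the sentinel
            out[i, :min(len(row), max_length)] = row[:max_length]
        return out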

# # -- train
# # design a Keras NN with three RNN streams (jets, photons, muons)
@@ -147,8 +118,8 @@ def sha(s):
# -- read in arguments
parser = argparse.ArgumentParser()
parser.add_argument('config', help="JSON file that specifies classes and corresponding ROOT files' paths")
parser.add_argument('--exclude', help="names of branches to exclude from training", nargs="*", default=[])
parser.add_argument('--tree', help="name of the tree to open in the ntuples", default='mini')
args = parser.parse_args()

# -- pass arguments to main
sys.exit(main(args.config, args.exclude))
sys.exit(main(args.config, args.tree))
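
With the new argument, a typical invocation looks like this (config filename hypothetical):

    python pipeline.py my_config.json --tree mini

Since '--tree' defaults to 'mini', the flag can be omitted when the ntuples use that tree name.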
26 changes: 19 additions & 7 deletions plotting.py
@@ -40,15 +40,15 @@ def _plot_X(train, test, y_train, y_test, w_train, w_test, varlist, feature):
min(min(flat_train), min(flat_test)),
max(max(flat_train), max(flat_test)),
30)
color = iter(cm.rainbow(np.linspace(0, 1, 2)))
color = iter(cm.rainbow(np.linspace(0, 1, len(np.unique(y_train)))))
# -- loop through the classes
for k in range(len(np.unique(y_train))):
c = next(color)
_ = plt.hist(flat_train[y_train_ext == k],
bins=bins,
histtype='step',
normed=True,
label='Train - class: '+str(k),
label='Train - class: ' + str(k),
weights=w_train_ext[y_train_ext == k],
color=c,
linewidth=1)
@@ -73,8 +73,10 @@ def _plot_X(train, test, y_train, y_test, w_train, w_test, varlist, feature):
#plt.show()
column_counter += 1

def plot_inputs(X_jets_train, X_jets_test, X_photons_train, X_photons_test,
X_muons_train, X_muons_test, y_train, y_test, w_train, w_test, varlist):
def plot_inputs(data, particle_names):
'''
Args:
X_jets_train: ndarray [n_ev_train, n_jet_feat] containing the
@@ -105,6 +107,16 @@ def plot_inputs(X_jets_train, X_photons_train, X_photons_test,
sets of each class for each feature
'''

_plot_X(X_jets_train, X_jets_test, y_train, y_test, w_train, w_test, varlist, 'Jet')
_plot_X(X_photons_train, X_photons_test, y_train, y_test, w_train, w_test, varlist, 'Photon')
_plot_X(X_muons_train, X_muons_test, y_train, y_test, w_train, w_test, varlist, 'Muon')
for particle in particle_names:
_plot_X(
data['X_' + particle + '_train'],
data['X_' + particle + '_test'],
data['y_train'],
data['y_test'],
data['w_train'],
data['w_test'],
data['varlist'],
particle)
14 changes: 13 additions & 1 deletion utils.py
@@ -1,4 +1,5 @@
import logging
import json

def configure_logging():
rlogger = logging.getLogger()
@@ -7,4 +8,15 @@ def configure_logging():
logging.addLevelName(logging.WARNING, "\033[1;31m{:8}\033[1;0m".format(logging.getLevelName(logging.WARNING)))
logging.addLevelName(logging.ERROR, "\033[1;35m{:8}\033[1;0m".format(logging.getLevelName(logging.ERROR)))
logging.addLevelName(logging.INFO, "\033[1;32m{:8}\033[1;0m".format(logging.getLevelName(logging.INFO)))
logging.addLevelName(logging.DEBUG, "\033[1;34m{:8}\033[1;0m".format(logging.getLevelName(logging.DEBUG)))

def load_config(config_file):
# TO DO: make sure that particle names don't have underscores in them
config = json.load(open(config_file, 'r'))
required_keys = ['classes', 'particles']

for k in required_keys:
if k not in config:
raise KeyError('pipeline configuration requires key: {}'.format(k))

return config
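
load_config therefore expects a JSON file carrying at least the 'classes' and 'particles' keys; a minimal hypothetical example (paths, branch names and lengths all illustrative):

    {
        "classes": {
            "signal": ["signal_ntuple.root"],
            "background": ["bkg_ntuple_1.root", "bkg_ntuple_2.root"]
        },
        "particles": {
            "jet":    {"branches": ["Jet_Pt", "Jet_Eta"], "max_length": 5},
            "photon": {"branches": ["Photon_Pt"],         "max_length": 3}
        }
    }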
