Commit

Refactor scaling and splitting, add hdf5 to pipeline
mickypaganini committed Jun 28, 2016
1 parent 55ca6e3 commit be22be5
Showing 2 changed files with 79 additions and 44 deletions.
99 changes: 57 additions & 42 deletions data_processing.py
@@ -6,32 +6,32 @@
import argparse
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.cross_validation import train_test_split
import pandautils as pup

def _root2pandas(file_paths, tree_name, **kwargs):
'''converts files from .root to pandas DataFrame
Args:
file_paths: a string like './data/*.root', or
a list of strings with multiple files to open
tree_name: a string like 'Collection_Tree' corresponding to the name of the folder inside the root file that we want to open
kwargs: arguments taken by root2array, such as branches to consider, start, stop, step, etc
Returns:
output_panda: a pandas dataframe like allbkg_df in which all the info from the root file will be stored
Note:
if you are working with .root files that contain different branches, you might have to mask your data
in that case, return pd.DataFrame(ss.data)
'''
# def _root2pandas(file_paths, tree_name, **kwargs):
# '''converts files from .root to pandas DataFrame
# Args:
# file_paths: a string like './data/*.root', or
# a list of strings with multiple files to open
# tree_name: a string like 'Collection_Tree' corresponding to the name of the folder inside the root file that we want to open
# kwargs: arguments taken by root2array, such as branches to consider, start, stop, step, etc
# Returns:
# output_panda: a pandas dataframe like allbkg_df in which all the info from the root file will be stored
# Note:
# if you are working with .root files that contain different branches, you might have to mask your data
# in that case, return pd.DataFrame(ss.data)
# '''

if isinstance(file_paths, basestring):
files = glob.glob(file_paths)
else:
files = [matched_f for f in file_paths for matched_f in glob.glob(f)]

ss = stack_arrays([root2array(fpath, tree_name, **kwargs).view(np.recarray) for fpath in files])
try:
return pd.DataFrame(ss)
except Exception:
return pd.DataFrame(ss.data)
# if isinstance(file_paths, basestring):
# files = glob.glob(file_paths)
# else:
# files = [matched_f for f in file_paths for matched_f in glob.glob(f)]

# ss = stack_arrays([root2array(fpath, tree_name, **kwargs).view(np.recarray) for fpath in files])
# try:
# return pd.DataFrame(ss)
# except Exception:
# return pd.DataFrame(ss.data)
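
Note: a minimal sketch of the ROOT-to-DataFrame pattern the helper above implements, kept here for reference. It assumes root_numpy is installed; the file path and tree name are placeholders, not files from this repository.

import numpy as np
import pandas as pd
from numpy.lib.recfunctions import stack_arrays
from root_numpy import root2array

# convert one or more .root files into a flat pandas DataFrame
files = ['./data/sample.root']  # placeholder path
arrays = [root2array(f, 'events').view(np.recarray) for f in files]
ss = stack_arrays(arrays, usemask=False, asrecarray=True)  # plain recarray, no mask
df = pd.DataFrame(ss)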

def _build_X(events, phrase, exclude_vars):
'''slices related branches into a numpy array
@@ -79,14 +79,12 @@ def read_in(class_files_dict, exclude_vars):
'''

#convert files to pd data frames, assign key to y, concat all files
all_events = False
for key in class_files_dict.keys():
df = _root2pandas(class_files_dict[key], 'events')
def _make_df(val, key):
df = _root2pandas(val, 'events')
df['y'] = key
if all_events is False:
all_events = df
else:
all_events = pd.concat([all_events, df], ignore_index=True)
return df

all_events = pd.concat([_make_df(val, key) for key, val in class_files_dict.iteritems()], ignore_index=True)

#slice related branches
X_jets, jet_branches = _build_X(all_events, 'Jet', exclude_vars)
@@ -98,10 +96,33 @@ def read_in(class_files_dict, exclude_vars):
y = le.fit_transform(all_events['y'].values)

w = all_events['EventWeight'].values
print jet_branches + photon_branches + muon_branches

return X_jets, X_photons, X_muons, y, w, jet_branches + photon_branches + muon_branches
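
Aside: the LabelEncoder call above maps the string class keys to integer targets; a small, self-contained illustration (the class names here are hypothetical, not keys from class_files_dict):

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(['bkg', 'signal', 'bkg', 'signal'])  # hypothetical keys
# y is now array([0, 1, 0, 1]) and le.classes_ is array(['bkg', 'signal'])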


def _scale(matrix_train, matrix_test):
'''
Use scikit-learn to scale features to 0 mean, 1 std.
Because of event-level structure, we need to flatten X, scale, and then reshape back into event format.
Args:
matrix_train: X_train [n_ev_train, n_particle_features], numpy ndarray of unscaled features of events allocated for training
matrix_test: X_test [n_ev_test, n_particle_features], numpy ndarray of unscaled features of events allocated for testing
Returns:
the same matrices after scaling
'''
from sklearn.preprocessing import StandardScaler
ref_test = matrix_test[:, 0]
ref_train = matrix_train[:, 0]
for col in xrange(matrix_train.shape[1]):
scaler = StandardScaler()
matrix_train[:, col] = pup.match_shape(
scaler.fit_transform(pup.flatten(matrix_train[:, col]).reshape(-1, 1)).ravel(), ref_train)
matrix_test[:, col] = pup.match_shape(
scaler.transform(pup.flatten(matrix_test[:, col]).reshape(-1, 1)).ravel(), ref_test)

return matrix_train, matrix_test
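
Note: _scale leans on the pandautils helpers to flatten jagged, per-event columns before scaling and to restore the event structure afterwards. A rough plain-numpy equivalent of that idea for a single jagged column is sketched below; the exact behaviour of pup.flatten and pup.match_shape is assumed, not verified here.

import numpy as np
from sklearn.preprocessing import StandardScaler

# one jagged column: a list of per-event arrays with varying numbers of particles
col = [np.array([1.0, 2.0]), np.array([3.0]), np.array([4.0, 5.0, 6.0])]

flat = np.concatenate(col).reshape(-1, 1)               # flatten to (n_particles, 1)
scaled = StandardScaler().fit_transform(flat).ravel()   # zero mean, unit variance

# restore the per-event structure (roughly what pup.match_shape does)
splits = np.cumsum([len(ev) for ev in col])[:-1]
col_scaled = np.split(scaled, splits)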


def shuffle_split_scale(X_jets, X_photons, X_muons, y, w):
'''
takes in X_jets, X_photons, X_muons, y and w ndarrays, shuffles them, splits them into test (40%) and training (60%) sets, and scales X_jet, \
@@ -131,15 +152,9 @@ def shuffle_split_scale(X_jets, X_photons, X_muons, y, w):
X_muons_train, X_muons_test, \
Y_train, Y_test, \
W_train, W_test = train_test_split(X_jets, X_photons, X_muons, y, w, test_size=0.4)
# #fit a transformation to the training set of the X_Jet, X_Photon, and X_Muon data and
# #thus apply a transformation to the train data and corresponding test data
# scaler = StandardScaler()
# print type(X_jets_train)
# print X_jets_train
# X_jets_train = scaler.fit_transform(X_jets_train)
# X_jets_test = scaler.transform(X_jets_test)
# X_photons_train = scaler.fit_transform(X_photons_train)
# X_photons_test = scaler.transform(X_photons_test)
# X_muons_train = scaler.fit_transform(X_muons_train)
# X_muons_test = scaler.transform(X_muons_test)

X_jets_train, X_jets_test = _scale(X_jets_train, X_jets_test)
X_photons_train, X_photons_test = _scale(X_photons_train, X_photons_test)
X_muons_train, X_muons_test = _scale(X_muons_train, X_muons_test)

return X_jets_train, X_jets_test, X_photons_train, X_photons_test, X_muons_train, X_muons_test, Y_train, Y_test, W_train, W_test
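
For reference, train_test_split accepts any number of equal-length arrays and returns a shuffled train/test pair for each, in the same order; a toy illustration (the arrays here are made up):

import numpy as np
from sklearn.cross_validation import train_test_split  # sklearn.model_selection in newer releases

X_a = np.arange(10).reshape(10, 1)
X_b = np.arange(10, 20).reshape(10, 1)
y = np.arange(10)

# one train/test pair per input array, 40% held out as in shuffle_split_scale
X_a_train, X_a_test, X_b_train, X_b_test, y_train, y_test = \
    train_test_split(X_a, X_b, y, test_size=0.4)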
24 changes: 22 additions & 2 deletions pipeline.py
@@ -1,5 +1,7 @@
import json
from data_processing import read_in, shuffle_split_scale
import pandautils as pup
import deepdish.io as io
#from plotting import plot_inputs, plot_performance
#from nn_model import train, test

@@ -25,20 +27,38 @@ def main(json_config, exclude_vars):
...
}
exclude_vars: list of strings of names of branches not to be used for training
Saves:
------
'processed_data.h5': dictionary with processed ndarrays (X, y, w) for all particles for training and testing
'''
# -- load in the JSON file
class_files_dict = json.load(open(json_config))

# -- transform ROOT files into standard ML format (ndarrays)
X_jets, X_photons, X_muons, y, w, varlist = read_in(class_files_dict, exclude_vars)

# -- shuffle, split samples into train and test set, scale features
X_jets_train, X_jets_test, \
X_photons_train, X_photons_test, \
X_muons_train, X_muons_test, \
y_train, y_test, \
w_train, w_test = shuffle_split_scale(X_jets, X_photons, X_muons, y, w)


# -- save out to hdf5
io.save('processed_data.h5', {
'X_jets_train' : X_jets_train,
'X_jets_test' : X_jets_test,
'X_photons_train' : X_photons_train,
'X_photons_test' : X_photons_test,
'X_muons_train' : X_muons_train,
'X_muons_test' : X_muons_test,
'y_train' : y_train,
'y_test' : y_test,
'w_train' : w_train,
'w_test' : w_test,
'varlist' : varlist
})
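
The saved file can be read back in a single call downstream; a short sketch, assuming the same deepdish version is available (the variable names simply mirror the keys above):

import deepdish.io as io

# reload the processed arrays, e.g. at the start of a training script
data = io.load('processed_data.h5')
X_jets_train = data['X_jets_train']
y_train = data['y_train']
w_train = data['w_train']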

# -- plot distributions:
# this should produce weighted histograms of the input distributions for all variables
# on each plot, the train and test distributions should be shown for every class
