diff --git a/data_processing.py b/data_processing.py
index ba710ff..7fa92dc 100644
--- a/data_processing.py
+++ b/data_processing.py
@@ -6,32 +6,32 @@ import argparse
 from sklearn.preprocessing import LabelEncoder, StandardScaler
 from sklearn.cross_validation import train_test_split
+import pandautils as pup
 
-def _root2pandas(file_paths, tree_name, **kwargs):
-    '''converts files from .root to pandas DataFrame
-    Args:
-        file_paths: a string like './data/*.root', or
-                    a list of strings with multiple files to open
-        tree_name: a string like 'Collection_Tree' corresponding to the name of the folder inside the root file that we want to open
-        kwargs: arguments taken by root2array, such as branches to consider, start, stop, step, etc
-    Returns:
-        output_panda: a pandas dataframe like allbkg_df in which all the info from the root file will be stored
-    Note:
-        if you are working with .root files that contain different branches, you might have to mask your data
-        in that case, return pd.DataFrame(ss.data)
-    '''
+# def _root2pandas(file_paths, tree_name, **kwargs):
+#     '''converts files from .root to pandas DataFrame
+#     Args:
+#         file_paths: a string like './data/*.root', or
+#                     a list of strings with multiple files to open
+#         tree_name: a string like 'Collection_Tree' corresponding to the name of the folder inside the root file that we want to open
+#         kwargs: arguments taken by root2array, such as branches to consider, start, stop, step, etc
+#     Returns:
+#         output_panda: a pandas dataframe like allbkg_df in which all the info from the root file will be stored
+#     Note:
+#         if you are working with .root files that contain different branches, you might have to mask your data
+#         in that case, return pd.DataFrame(ss.data)
+#     '''
-    if isinstance(file_paths, basestring):
-        files = glob.glob(file_paths)
-    else:
-        files = [matched_f for f in file_paths for matched_f in glob.glob(f)]
-
-    ss = stack_arrays([root2array(fpath, tree_name, **kwargs).view(np.recarray) for fpath in files])
-    try:
-        return pd.DataFrame(ss)
-    except Exception:
-        return pd.DataFrame(ss.data)
+#     if isinstance(file_paths, basestring):
+#         files = glob.glob(file_paths)
+#     else:
+#         files = [matched_f for f in file_paths for matched_f in glob.glob(f)]
+#     ss = stack_arrays([root2array(fpath, tree_name, **kwargs).view(np.recarray) for fpath in files])
+#     try:
+#         return pd.DataFrame(ss)
+#     except Exception:
+#         return pd.DataFrame(ss.data)
 
 def _build_X(events, phrase, exclude_vars):
     '''slices related branches into a numpy array
@@ -79,14 +79,12 @@ def read_in(class_files_dict, exclude_vars):
     '''
 
    #convert files to pd data frames, assign key to y, concat all files
-    all_events = False
-    for key in class_files_dict.keys():
-        df = _root2pandas(class_files_dict[key], 'events')
+    def _make_df(val, key):
+        df = pup.root2pandas(val, 'events')
        df['y'] = key
-        if all_events is False:
-            all_events = df
-        else:
-            all_events = pd.concat([all_events, df], ignore_index=True)
+        return df
+
+    all_events = pd.concat([_make_df(val, key) for key, val in class_files_dict.iteritems()], ignore_index=True)
 
    #slice related branches
    X_jets, jet_branches = _build_X(all_events, 'Jet', exclude_vars)
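Note on the read_in hunk above: the sentinel-and-reassign accumulation is replaced by a single pd.concat over a list comprehension, so each class's DataFrame is built once, labelled with its class key, and concatenated in one pass (.iteritems() keeps this on Python 2, like the rest of the module). The call now goes through pup.root2pandas, since the local _root2pandas helper is commented out in this same patch and evidently lives in pandautils. A minimal standalone sketch of the pattern, with a toy frame standing in for pup.root2pandas (which actually reads the ROOT files):

import pandas as pd

# toy stand-in for the {class name -> ROOT file paths} mapping read from the JSON config
class_files_dict = {'bkg': ['bkg_1.root', 'bkg_2.root'], 'sig': ['sig.root']}

def _make_df(val, key):
    # stand-in for pup.root2pandas(val, 'events'), which reads the files listed in val
    df = pd.DataFrame({'jet_pt': [40.0, 25.0]})
    df['y'] = key  # label every event with its class key
    return df

# one labelled frame per class, concatenated once; ignore_index renumbers rows 0..n-1
all_events = pd.concat(
    [_make_df(val, key) for key, val in class_files_dict.iteritems()],
    ignore_index=True)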
@@ -98,10 +96,33 @@ def read_in(class_files_dict, exclude_vars):
    y = le.fit_transform(all_events['y'].values)
 
    w = all_events['EventWeight'].values
-    print jet_branches + photon_branches + muon_branches
 
    return X_jets, X_photons, X_muons, y, w, jet_branches + photon_branches + muon_branches
 
+
+def _scale(matrix_train, matrix_test):
+    '''
+    Use scikit-learn to scale features to 0 mean, 1 std.
+    Because of the event-level structure, we need to flatten X, scale, and then reshape back into event format.
+    Args:
+        matrix_train: X_train [n_ev_train, n_particle_features], numpy ndarray of unscaled features of events allocated for training
+        matrix_test: X_test [n_ev_test, n_particle_features], numpy ndarray of unscaled features of events allocated for testing
+    Returns:
+        the same matrices after scaling
+    '''
+    from sklearn.preprocessing import StandardScaler
+    ref_test = matrix_test[:, 0]
+    ref_train = matrix_train[:, 0]
+    for col in xrange(matrix_train.shape[1]):
+        scaler = StandardScaler()
+        matrix_train[:, col] = pup.match_shape(
+            scaler.fit_transform(pup.flatten(matrix_train[:, col]).reshape(-1, 1)).ravel(), ref_train)
+        matrix_test[:, col] = pup.match_shape(
+            scaler.transform(pup.flatten(matrix_test[:, col]).reshape(-1, 1)).ravel(), ref_test)
+
+    return matrix_train, matrix_test
+
+
 def shuffle_split_scale(X_jets, X_photons, X_muons, y, w):
     '''
     takes in X_jets, X_photons, X_Muons, y and w nd arrays, shuffles them, splits them into test (40%) and training (60%) sets, and scales X_jet, \
@@ -131,15 +152,9 @@ def shuffle_split_scale(X_jets, X_photons, X_muons, y, w):
    X_muons_train, X_muons_test, \
    Y_train, Y_test, \
    W_train, W_test = train_test_split(X_jets, X_photons, X_muons, y, w, test_size=0.4)
-    # #fit a transformation to the training set of the X_Jet, X_Photon, and X_Muon data and
-    # #thus apply a transformation to the train data and corresponding test data
-    # scaler = StandardScaler()
-    # print type(X_jets_train)
-    # print X_jets_train
-    # X_jets_train = scaler.fit_transform(X_jets_train)
-    # X_jets_test = scaler.transform(X_jets_test)
-    # X_photons_train = scaler.fit_transform(X_photons_train)
-    # X_photons_test = scaler.transform(X_photons_test)
-    # X_muons_train = scaler.fit_transform(X_muons_train)
-    # X_muons_test = scaler.transform(X_muons_test)
+
+    X_jets_train, X_jets_test = _scale(X_jets_train, X_jets_test)
+    X_photons_train, X_photons_test = _scale(X_photons_train, X_photons_test)
+    X_muons_train, X_muons_test = _scale(X_muons_train, X_muons_test)
+
    return X_jets_train, X_jets_test, X_photons_train, X_photons_test, X_muons_train, X_muons_test, Y_train, Y_test, W_train, W_test
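Each column of these X matrices is jagged (one variable-length array of particle-level values per event), so StandardScaler cannot be applied to it directly; _scale therefore flattens a column, fits the scaler on the training values only, applies the same transformation to the test values, and restores the per-event structure. Fitting on the training set alone keeps test-set statistics from leaking into the preprocessing. A self-contained sketch of the assumed pup.flatten / pup.match_shape round trip; these are toy re-implementations for illustration, not the actual pandautils code:

import numpy as np

def flatten(column):
    # assumed semantics of pup.flatten: concatenate a jagged column
    # (ndarray of per-event arrays) into one flat 1D array
    return np.concatenate([np.asarray(ev) for ev in column])

def match_shape(flat, ref):
    # assumed semantics of pup.match_shape: split a flat array back into
    # per-event arrays whose lengths match the reference jagged column
    lengths = [len(ev) for ev in ref]
    return np.array(np.split(flat, np.cumsum(lengths)[:-1]), dtype=object)

# one feature for three events containing 2, 1 and 3 jets respectively
jet_pt = np.array([np.array([40., 25.]), np.array([60.]),
                   np.array([30., 20., 10.])], dtype=object)

flat = flatten(jet_pt)                      # [40. 25. 60. 30. 20. 10.]
scaled = (flat - flat.mean()) / flat.std()  # what StandardScaler does per column
restored = match_shape(scaled, jet_pt)      # jagged again, same 2/1/3 structure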
diff --git a/pipeline.py b/pipeline.py
index 70abcb1..1e682b5 100644
--- a/pipeline.py
+++ b/pipeline.py
@@ -1,5 +1,7 @@
 import json
 from data_processing import read_in, shuffle_split_scale
+import pandautils as pup
+import deepdish.io as io
 #from plotting import plot_inputs, plot_performance
 #from nn_model import train, test
 
@@ -25,20 +27,38 @@ def main(json_config, exclude_vars):
            ...
        }
        exclude_vars: list of strings of names of branches not to be used for training
+    Saves:
+    ------
+        'processed_data.h5': dictionary with processed ndarrays (X, y, w) for all particles for training and testing
    '''
    # -- load in the JSON file
    class_files_dict = json.load(open(json_config))
 
    # -- transform ROOT files into standard ML format (ndarrays)
    X_jets, X_photons, X_muons, y, w, varlist = read_in(class_files_dict, exclude_vars)
-    
+
    # -- shuffle, split samples into train and test set, scale features
    X_jets_train, X_jets_test, \
    X_photons_train, X_photons_test, \
    X_muons_train, X_muons_test, \
    y_train, y_test, \
    w_train, w_test = shuffle_split_scale(X_jets, X_photons, X_muons, y, w)
-    
+
+    # -- save out to hdf5
+    io.save('processed_data.h5', {
+        'X_jets_train' : X_jets_train,
+        'X_jets_test' : X_jets_test,
+        'X_photons_train' : X_photons_train,
+        'X_photons_test' : X_photons_test,
+        'X_muons_train' : X_muons_train,
+        'X_muons_test' : X_muons_test,
+        'y_train' : y_train,
+        'y_test' : y_test,
+        'w_train' : w_train,
+        'w_test' : w_test,
+        'varlist' : varlist
+    })
+
    # -- plot distributions:
    # this should produce weighted histograms of the input distributions for all variables
    # on each plot, the train and test distributions should be shown for every class
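Once pipeline.py has run, downstream stages (the plotting and nn_model imports still commented out above) can recover the exact dictionary written by io.save via its deepdish counterpart, io.load. A short usage sketch:

import deepdish.io as io

# deepdish.io.load is the counterpart of the io.save call above:
# it returns the saved dictionary of ndarrays under the same keys
data = io.load('processed_data.h5')

X_jets_train = data['X_jets_train']
y_train = data['y_train']
w_train = data['w_train']
varlist = data['varlist']  # branch names, e.g. for labelling input-distribution plots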