Commit

Refactor scaling and splitting, add hdf5 to pipeline
mickypaganini committed Jun 28, 2016
1 parent 55ca6e3 commit be22be5
Showing 2 changed files with 79 additions and 44 deletions.
99 changes: 57 additions & 42 deletions data_processing.py
@@ -6,32 +6,32 @@
import argparse
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.cross_validation import train_test_split
import pandautils as pup

def _root2pandas(file_paths, tree_name, **kwargs):
'''converts files from .root to pandas DataFrame
Args:
file_paths: a string like './data/*.root', or
a list of strings with multiple files to open
tree_name: a string like 'Collection_Tree' corresponding to the name of the folder inside the root file that we want to open
kwargs: arguments taken by root2array, such as branches to consider, start, stop, step, etc
Returns:
output_panda: a pandas dataframe like allbkg_df in which all the info from the root file will be stored
Note:
if you are working with .root files that contain different branches, you might have to mask your data
in that case, return pd.DataFrame(ss.data)
'''
# def _root2pandas(file_paths, tree_name, **kwargs):
# '''converts files from .root to pandas DataFrame
# Args:
# file_paths: a string like './data/*.root', or
# a list of strings with multiple files to open
# tree_name: a string like 'Collection_Tree' corresponding to the name of the folder inside the root file that we want to open
# kwargs: arguments taken by root2array, such as branches to consider, start, stop, step, etc
# Returns:
# output_panda: a pandas dataframe like allbkg_df in which all the info from the root file will be stored
# Note:
# if you are working with .root files that contain different branches, you might have to mask your data
# in that case, return pd.DataFrame(ss.data)
# '''

if isinstance(file_paths, basestring):
files = glob.glob(file_paths)
else:
files = [matched_f for f in file_paths for matched_f in glob.glob(f)]

ss = stack_arrays([root2array(fpath, tree_name, **kwargs).view(np.recarray) for fpath in files])
try:
return pd.DataFrame(ss)
except Exception:
return pd.DataFrame(ss.data)
# if isinstance(file_paths, basestring):
# files = glob.glob(file_paths)
# else:
# files = [matched_f for f in file_paths for matched_f in glob.glob(f)]

# ss = stack_arrays([root2array(fpath, tree_name, **kwargs).view(np.recarray) for fpath in files])
# try:
# return pd.DataFrame(ss)
# except Exception:
# return pd.DataFrame(ss.data)
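
Note: a minimal sketch of the ROOT-to-DataFrame pattern the helper above implements, kept here for reference. It assumes root_numpy is installed; the file path and tree name are placeholders, not files from this repository.

import numpy as np
import pandas as pd
from numpy.lib.recfunctions import stack_arrays
from root_numpy import root2array

# convert one or more .root files into a flat pandas DataFrame
files = ['./data/sample.root']  # placeholder path
arrays = [root2array(f, 'events').view(np.recarray) for f in files]
ss = stack_arrays(arrays, usemask=False, asrecarray=True)  # plain recarray, no mask
df = pd.DataFrame(ss)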

def _build_X(events, phrase, exclude_vars):
'''slices related branches into a numpy array
@@ -79,14 +79,12 @@ def read_in(class_files_dict, exclude_vars):
'''

#convert files to pd data frames, assign key to y, concat all files
all_events = False
for key in class_files_dict.keys():
df = _root2pandas(class_files_dict[key], 'events')
def _make_df(val, key):
df = _root2pandas(val, 'events')
df['y'] = key
if all_events is False:
all_events = df
else:
all_events = pd.concat([all_events, df], ignore_index=True)
return df

all_events = pd.concat([_make_df(val, key) for key, val in class_files_dict.iteritems()], ignore_index=True)

#slice related branches
X_jets, jet_branches = _build_X(all_events, 'Jet', exclude_vars)
@@ -98,10 +96,33 @@ def read_in(class_files_dict, exclude_vars):
y = le.fit_transform(all_events['y'].values)

w = all_events['EventWeight'].values
print jet_branches + photon_branches + muon_branches

return X_jets, X_photons, X_muons, y, w, jet_branches + photon_branches + muon_branches
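
Aside: the LabelEncoder call above maps the string class keys to integer targets; a small, self-contained illustration (the class names here are hypothetical, not keys from class_files_dict):

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(['bkg', 'signal', 'bkg', 'signal'])  # hypothetical keys
# y is now array([0, 1, 0, 1]) and le.classes_ is array(['bkg', 'signal'])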


def _scale(matrix_train, matrix_test):
'''
Use scikit-learn to scale features to 0 mean, 1 std.
Because of event-level structure, we need to flatten X, scale, and then reshape back into event format.
Args:
matrix_train: X_train [n_ev_train, n_particle_features], numpy ndarray of unscaled features of events allocated for training
matrix_test: X_test [n_ev_test, n_particle_features], numpy ndarray of unscaled features of events allocated for testing
Returns:
the same matrices after scaling
'''
from sklearn.preprocessing import StandardScaler
ref_test = matrix_test[:, 0]
ref_train = matrix_train[:, 0]
for col in xrange(matrix_train.shape[1]):
scaler = StandardScaler()
matrix_train[:, col] = pup.match_shape(
scaler.fit_transform(pup.flatten(matrix_train[:, col]).reshape(-1, 1)).ravel(), ref_train)
matrix_test[:, col] = pup.match_shape(
scaler.transform(pup.flatten(matrix_test[:, col]).reshape(-1, 1)).ravel(), ref_test)

return matrix_train, matrix_test
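
Note: _scale leans on the pandautils helpers to flatten jagged, per-event columns before scaling and to restore the event structure afterwards. A rough plain-numpy equivalent of that idea for a single jagged column is sketched below; the exact behaviour of pup.flatten and pup.match_shape is assumed, not verified here.

import numpy as np
from sklearn.preprocessing import StandardScaler

# one jagged column: a list of per-event arrays with varying numbers of particles
col = [np.array([1.0, 2.0]), np.array([3.0]), np.array([4.0, 5.0, 6.0])]

flat = np.concatenate(col).reshape(-1, 1)               # flatten to (n_particles, 1)
scaled = StandardScaler().fit_transform(flat).ravel()   # zero mean, unit variance

# restore the per-event structure (roughly what pup.match_shape does)
splits = np.cumsum([len(ev) for ev in col])[:-1]
col_scaled = np.split(scaled, splits)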


def shuffle_split_scale(X_jets, X_photons, X_muons, y, w):
'''
takes in X_jets, X_photons, X_muons, y and w ndarrays, shuffles them, splits them into test (40%) and training (60%) sets, and scales X_jet, \
@@ -131,15 +152,9 @@ def shuffle_split_scale(X_jets, X_photons, X_muons, y, w):
X_muons_train, X_muons_test, \
Y_train, Y_test, \
W_train, W_test = train_test_split(X_jets, X_photons, X_muons, y, w, test_size=0.4)
# #fit a transformation to the training set of the X_Jet, X_Photon, and X_Muon data and
# #thus apply a transformation to the train data and corresponding test data
# scaler = StandardScaler()
# print type(X_jets_train)
# print X_jets_train
# X_jets_train = scaler.fit_transform(X_jets_train)
# X_jets_test = scaler.transform(X_jets_test)
# X_photons_train = scaler.fit_transform(X_photons_train)
# X_photons_test = scaler.transform(X_photons_test)
# X_muons_train = scaler.fit_transform(X_muons_train)
# X_muons_test = scaler.transform(X_muons_test)

X_jets_train, X_jets_test = _scale(X_jets_train, X_jets_test)
X_photons_train, X_photons_test = _scale(X_photons_train, X_photons_test)
X_muons_train, X_muons_test = _scale(X_muons_train, X_muons_test)

return X_jets_train, X_jets_test, X_photons_train, X_photons_test, X_muons_train, X_muons_test, Y_train, Y_test, W_train, W_test
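
For reference, train_test_split accepts any number of equal-length arrays and returns a shuffled train/test pair for each, in the same order; a toy illustration (the arrays here are made up):

import numpy as np
from sklearn.cross_validation import train_test_split  # sklearn.model_selection in newer releases

X_a = np.arange(10).reshape(10, 1)
X_b = np.arange(10, 20).reshape(10, 1)
y = np.arange(10)

# one train/test pair per input array, 40% held out as in shuffle_split_scale
X_a_train, X_a_test, X_b_train, X_b_test, y_train, y_test = \
    train_test_split(X_a, X_b, y, test_size=0.4)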
24 changes: 22 additions & 2 deletions pipeline.py
@@ -1,5 +1,7 @@
import json
from data_processing import read_in, shuffle_split_scale
import pandautils as pup
import deepdish.io as io
#from plotting import plot_inputs, plot_performance
#from nn_model import train, test

@@ -25,20 +27,38 @@ def main(json_config, exclude_vars):
...
}
exclude_vars: list of strings of names of branches not to be used for training
Saves:
------
'processed_data.h5': dictionary with processed ndarrays (X, y, w) for all particles for training and testing
'''
# -- load in the JSON file
class_files_dict = json.load(open(json_config))

# -- transform ROOT files into standard ML format (ndarrays)
X_jets, X_photons, X_muons, y, w, varlist = read_in(class_files_dict, exclude_vars)

# -- shuffle, split samples into train and test set, scale features
X_jets_train, X_jets_test, \
X_photons_train, X_photons_test, \
X_muons_train, X_muons_test, \
y_train, y_test, \
w_train, w_test = shuffle_split_scale(X_jets, X_photons, X_muons, y, w)


# -- save out to hdf5
io.save('processed_data.h5', {
'X_jets_train' : X_jets_train,
'X_jets_test' : X_jets_test,
'X_photons_train' : X_photons_train,
'X_photons_test' : X_photons_test,
'X_muons_train' : X_muons_train,
'X_muons_test' : X_muons_test,
'y_train' : y_train,
'y_test' : y_test,
'w_train' : w_train,
'w_test' : w_test,
'varlist' : varlist
})
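
The saved file can be read back in a single call downstream; a short sketch, assuming the same deepdish version is available (the variable names simply mirror the keys above):

import deepdish.io as io

# reload the processed arrays, e.g. at the start of a training script
data = io.load('processed_data.h5')
X_jets_train = data['X_jets_train']
y_train = data['y_train']
w_train = data['w_train']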

# -- plot distributions:
# this should produce weighted histograms of the input distributions for all variables
# on each plot, the train and test distributions should be shown for every class
