From 2d599128fa55c9133bfdd031ca34e50e544b4952 Mon Sep 17 00:00:00 2001
From: Michela Paganini
Date: Wed, 6 Jul 2016 02:06:29 +0200
Subject: [PATCH] Refactoring to generalize to N streams

---
 data_processing.py | 72 ++++++++++++++++++++-------------------
 pipeline.py        | 85 +++++++++++++++-------------------------------
 plotting.py        | 26 ++++++++++----
 utils.py           | 14 +++++++-
 4 files changed, 98 insertions(+), 99 deletions(-)

diff --git a/data_processing.py b/data_processing.py
index 378cd4c..4dd1256 100644
--- a/data_processing.py
+++ b/data_processing.py
@@ -5,8 +5,18 @@
 from sklearn.cross_validation import train_test_split
 import pandautils as pup
 import warnings
+import logging
+from collections import OrderedDict
+from itertools import izip
 
-def _build_X(events, phrase, exclude_vars):
+logger = logging.getLogger('data_processing')
+
+def _pairwise(iterable):
+    "s -> (s0, s1), (s2, s3), (s4, s5), ..."
+    a = iter(iterable)
+    return izip(a, a)
+
+def _build_X(events, particle_branches):
     '''slices related branches into a numpy array
     Args:
         events: a pandas DataFrame containing the complete data by event
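For reference, the new _pairwise helper chops a flat iterable into consecutive pairs by pulling twice from a single iterator; shuffle_split_scale (further down in this file) uses it to regroup the interleaved (train, test) outputs of train_test_split stream by stream. A minimal illustration of the behavior, with invented sample values:

    from itertools import izip

    def _pairwise(iterable):
        "s -> (s0, s1), (s2, s3), (s4, s5), ..."
        a = iter(iterable)
        return izip(a, a)

    pairs = _pairwise(['jet_train', 'jet_test', 'photon_train', 'photon_test'])
    print list(pairs)  # [('jet_train', 'jet_test'), ('photon_train', 'photon_test')]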
@@ -14,12 +24,11 @@ def _build_X(events, phrase, exclude_vars):
     Returns:
         output_array: a numpy array containing data only pertaining to the related branches
     '''
-    branch_names = [key for key in events.keys() if (key.startswith(phrase) and (key not in exclude_vars))]
-    sliced_events = events[branch_names].as_matrix()
-    return sliced_events, branch_names
+    sliced_events = events[particle_branches].values
+    return sliced_events
 
 
-def read_in(class_files_dict, exclude_vars):
+def read_in(class_files_dict, tree_name, streams):
     '''
     takes in dict mapping class names to list of root files, loads them and slices them into ML format
     Args:
         class_files_dict: dictionary that links the names of the different classes
             in the classification problem to the paths of the ROOT files
             associated with each class; for example:
             {
                 "ttbar" :
                 [
                     "/path/to/file1.root",
                     "/path/to/file2.root",
                 ],
                 "qcd" :
                 [
                     "/path/to/file3.root",
                     "/path/to/file4*",
                 ],
                 ...
             }
-        exclude_vars: list of strings of names of branches not to be used for training
+        tree_name: string, name of the tree to open in the ntuples
+        streams: dictionary that maps each stream name (e.g. "jet") to the list
+            of branches to slice out for that stream
+
     Returns:
-        X_jets: ndarray [n_ev, n_jet_feat] containing jet related branches
-        X_photons: ndarray [n_ev, n_photon_feat] containing photon related branches
+        X: OrderedDict mapping each stream name to an ndarray [n_ev, n_stream_feat]
+           containing only the branches assigned to that stream
@@ -54,24 +64,26 @@ def read_in(class_files_dict, exclude_vars):
     #convert files to pd data frames, assign key to y, concat all files
     def _make_df(val, key):
-        df = pup.root2panda(val, 'events')
+        df = pup.root2panda(val, tree_name)
         df['y'] = key
         return df
 
     all_events = pd.concat([_make_df(val, key) for key, val in class_files_dict.iteritems()], ignore_index=True)
-
-    #slice related branches
-    X_jets, jet_branches = _build_X(all_events, 'Jet', exclude_vars)
-    X_photons, photon_branches = _build_X(all_events, 'Photon', exclude_vars)
-    X_muons, muon_branches = _build_X(all_events, 'Muon', exclude_vars)
+    X = OrderedDict()
+    for stream_name, stream_info in streams.iteritems():
+        logger.info('building X_{}'.format(stream_name))
+        X[stream_name] = _build_X(all_events, stream_info["branches"])
+
     #transform string labels to integer classes
     le = LabelEncoder()
     y = le.fit_transform(all_events['y'].values)
-    w = all_events['EventWeight'].values
+    w = all_events['yybb_weight'].values
 
-    return X_jets, X_photons, X_muons, y, w, jet_branches + photon_branches + muon_branches
+    return X, y, w
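Note that read_in now receives its per-stream branch lists through the streams argument (populated from the 'particles' section of the JSON config in pipeline.py) instead of deriving them from a branch-name prefix plus an exclusion list. A hypothetical config illustrating the expected shape: the branch names are invented, but the 'classes' and 'particles' keys, the per-particle 'branches' lists, and the 'max_length' field are the ones the pipeline reads, and the max_length values mirror the constants (5, 3, 2) previously hard-coded in pipeline.py:

    {
        "classes": {
            "ttbar": ["/path/to/file1.root", "/path/to/file2.root"],
            "qcd":   ["/path/to/file3.root", "/path/to/file4*"]
        },
        "particles": {
            "jet":    {"branches": ["JetPt", "JetEta"], "max_length": 5},
            "photon": {"branches": ["PhotonPt", "PhotonEta"], "max_length": 3},
            "muon":   {"branches": ["MuonPt", "MuonEta"], "max_length": 2}
        }
    }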
@@ -99,7 +111,8 @@ def _scale(matrix_train, matrix_test):
     return matrix_train, matrix_test
 
 
-def shuffle_split_scale(X_jets, X_photons, X_muons, y, w):
+def shuffle_split_scale(X, y, w):
     '''
     takes in X_jets, X_photons, X_Muons, y and w nd arrays, shuffles them, splits them into test (40%) and training (60%) sets
     Args:
@@ -109,29 +122,20 @@ def shuffle_split_scale(X_jets, X_photons, X_muons, y, w):
         y: ndarray [n_ev, 1] containing the truth labels
         w: ndarray [n_ev, 1] containing EventWeights
     Returns:
-        X_jets_train: ndarray [n_ev_train, n_jet_feat] containing the events of jet related branches allocated for training
-        X_jets_test: ndarray [n_ev_test, n_jet_feat] containing the events of jet related branches allocated for testing
-        X_photons_train: ndarray [n_ev_train, n_photon_feat] containing the events of photon related branches allocated for training
-        X_photons_test: ndarray [n_ev_test, n_photon_feat] containing the events of photon related branches allocated for testing
-        X_muons_train: ndarray [n_ev_train, n_muon_feat] containing the events of muon related branches allocated for training
-        X_muons_test: ndarray [n_ev_test, n_muon_feat] containing the events of muon related branches allocated for testing
-        Y_train: ndarray [n_ev_train, 1] containing the shuffled truth labels for training
-        Y_test: ndarray [n_ev_test, 1] containing the shuffled truth labels allocated for testing
-        W_train: ndarray [n_ev_train, 1] containing the shuffled EventWeights allocated for training
-        W_test: ndarray [n_ev_test, 1] containing the shuffled EventWeights allocated for testing
+        data: OrderedDict containing the shuffled, split and scaled arrays, keyed by
+              'X_<stream>_train' and 'X_<stream>_test' for each input stream,
+              plus 'y_train', 'y_test', 'w_train' and 'w_test'
     '''
     #shuffle events & split into testing and training sets
-    X_jets_train, X_jets_test, \
-    X_photons_train, X_photons_test, \
-    X_muons_train, X_muons_test, \
-    Y_train, Y_test, \
-    W_train, W_test = train_test_split(X_jets, X_photons, X_muons, y, w, test_size=0.4)
+    logger.info('shuffling, splitting and scaling X')
+
+    data_tuple = train_test_split(*(X.values() + [y, w]), test_size=0.4)
 
-    X_jets_train, X_jets_test = _scale(X_jets_train, X_jets_test)
-    X_photons_train, X_photons_test = _scale(X_photons_train, X_photons_test)
-    X_muons_train, X_muons_test = _scale(X_muons_train, X_muons_test)
+    data = OrderedDict()
+    for particle, (train, test) in zip(X.keys(), _pairwise(data_tuple[:(2 * len(X))])):
+        data['X_' + particle + '_train'], data['X_' + particle + '_test'] = _scale(train, test)
 
-    return X_jets_train, X_jets_test, X_photons_train, X_photons_test, X_muons_train, X_muons_test, Y_train, Y_test, W_train, W_test
+    data['y_train'], data['y_test'], data['w_train'], data['w_test'] = data_tuple[-4:]
+
+    return data
 
 
 def padding(X, max_length, value=-999):
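The rewrite above relies on train_test_split applying one common shuffle to every positional array and returning the splits interleaved as (a_train, a_test, b_train, b_test, ...), so the first 2 * len(X) entries are the stream arrays in X's insertion order and the last four are the y and w splits. A toy check of that ordering (shapes and values are arbitrary; sklearn.cross_validation is the pre-0.18 module path this code already uses):

    import numpy as np
    from collections import OrderedDict
    from sklearn.cross_validation import train_test_split

    X = OrderedDict([('jet', np.arange(10).reshape(5, 2)),
                     ('photon', np.arange(15).reshape(5, 3))])
    y = np.array([0, 0, 1, 1, 1])
    w = np.ones(5)

    out = train_test_split(*(X.values() + [y, w]), test_size=0.4)
    # out == [jet_train, jet_test, photon_train, photon_test,
    #         y_train, y_test, w_train, w_test]
    assert len(out) == 2 * len(X) + 4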
diff --git a/pipeline.py b/pipeline.py
index 4eb4e2c..2186752 100644
--- a/pipeline.py
+++ b/pipeline.py
@@ -8,7 +8,7 @@
 #from plotting import plot_inputs, plot_performance
 #from nn_model import train, test
 
-def main(json_config, exclude_vars):
+def main(json_config, tree_name):
     '''
     Args:
     -----
@@ -29,7 +29,7 @@ def main(json_config, exclude_vars):
             ],
             ...
         }
-        exclude_vars: list of strings of names of branches not to be used for training
+        tree_name: string, name of the tree to open in the ntuples
     Saves:
     ------
-        'processed_data.h5': dictionary with processed ndarrays (X, y, w) for all particles for training and testing
+        'processed_data_<hash>.pkl': dictionary with processed ndarrays (X, y, w) for all particles for training and testing
     '''
@@ -38,7 +38,9 @@ def main(json_config, exclude_vars):
 
     # -- load in the JSON file
     logger.info('Loading JSON config')
-    class_files_dict = json.load(open(json_config))
+    config = utils.load_config(json_config)
+    class_files_dict = config['classes']
+    particles = config['particles']
 
     # -- hash the config dictionary to check if the pickled data exists
     from hashlib import md5
@@ -48,49 +50,31 @@ def sha(s):
         m.update(s.__repr__())
         return m.hexdigest()[:5]
 
     # -- if the pickle exists, use it
     try:
         data = cPickle.load(open('processed_data_' + sha(class_files_dict) + '.pkl', 'rb'))
         logger.info('Preprocessed data found in pickle')
-        X_jets_train = data['X_jets_train']
-        X_jets_test = data['X_jets_test']
-        X_photons_train = data['X_photons_train']
-        X_photons_test = data['X_photons_test']
-        X_muons_train = data['X_muons_train']
-        X_muons_test = data['X_muons_test']
-        y_train = data['y_train']
-        y_test = data['y_test']
-        w_train = data['w_train']
-        w_test = data['w_test']
-        varlist = data['varlist']
-
-    # -- otherwise, process the new data
+
+    # -- otherwise, process the new data
     except IOError:
         logger.info('Preprocessed data not found')
         logger.info('Processing data')
         # -- transform ROOT files into standard ML format (ndarrays)
-        X_jets, X_photons, X_muons, y, w, varlist = read_in(class_files_dict, exclude_vars)
+        X, y, w = read_in(class_files_dict, tree_name, particles)
+
         # -- shuffle, split samples into train and test set, scale features
-        X_jets_train, X_jets_test, \
-        X_photons_train, X_photons_test, \
-        X_muons_train, X_muons_test, \
-        y_train, y_test, \
-        w_train, w_test = shuffle_split_scale(X_jets, X_photons, X_muons, y, w)
+        data = shuffle_split_scale(X, y, w)
+
+        data.update({
+            'varlist' : [
+                branch
+                for particle_info in particles.values()
+                for branch in particle_info['branches']
+            ]
+        })
 
         # -- save out to pickle
         logger.info('Saving processed data to pickle')
-        cPickle.dump({
-            'X_jets_train' : X_jets_train,
-            'X_jets_test' : X_jets_test,
-            'X_photons_train' : X_photons_train,
-            'X_photons_test' : X_photons_test,
-            'X_muons_train' : X_muons_train,
-            'X_muons_test' : X_muons_test,
-            'y_train' : y_train,
-            'y_test' : y_test,
-            'w_train' : w_train,
-            'w_test' : w_test,
-            'varlist' : varlist
-            },
+        cPickle.dump(data,
             open('processed_data_' + sha(class_files_dict) + '.pkl', 'wb'),
             protocol=cPickle.HIGHEST_PROTOCOL)
 
@@ -101,25 +85,12 @@ def sha(s):
     Plots should be saved out as a pdf with informative names
     '''
     logger.info('Plotting input distributions')
-    plot_inputs(
-        X_jets_train, X_jets_test,
-        X_photons_train, X_photons_test,
-        X_muons_train, X_muons_test,
-        y_train, y_test,
-        w_train, w_test,
-        varlist
-    )
-
-    X_jets_train, X_jets_test, \
-    X_photons_train, X_photons_test, \
-    X_muons_train, X_muons_test = map(padding,
-        [
-            X_jets_train, X_jets_test,
-            X_photons_train, X_photons_test,
-            X_muons_train, X_muons_test
-        ],
-        [5, 5, 3, 3, 2, 2]
-    )
+    plot_inputs(data, particles.keys())
+
+    logger.info('Padding')
+    for key in data:
+        if key.startswith('X_'):
+            data[key] = padding(data[key], particles[key.split('_')[1]]['max_length'])
 
 # # -- train
 # # design a Keras NN with three RNN streams (jets, photons, muons)
@@ -147,8 +118,8 @@
     # -- read in arguments
     parser = argparse.ArgumentParser()
    parser.add_argument('config', help="JSON file that specifies classes and corresponding ROOT files' paths")
-    parser.add_argument('--exclude', help="names of branches to exclude from training", nargs="*", default=[])
+    parser.add_argument('--tree', help="name of the tree to open in the ntuples", default='mini')
     args = parser.parse_args()
 
     # -- pass arguments to main
-    sys.exit(main(args.config, args.exclude))
+    sys.exit(main(args.config, args.tree))
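The padding function called in the new loop is not modified by this patch and its body is not shown here. As a rough sketch of the contract the loop assumes, each 'X_...' entry holds per-event, variable-length object lists that get truncated or padded with a sentinel value up to the stream's max_length. The following is an illustration under those assumptions, not the repository's implementation:

    import numpy as np

    def padding_sketch(X, max_length, value=-999):
        # X: [n_ev, n_feat] object array; each cell holds a variable-length
        # sequence of per-object values (e.g. the pT of every jet in an event)
        n_ev, n_feat = X.shape
        out = np.full((n_ev, n_feat, max_length), value, dtype=float)
        for i in range(n_ev):
            for j in range(n_feat):
                seq = np.asarray(X[i, j], dtype=float)[:max_length]  # truncate long events
                out[i, j, :len(seq)] = seq                           # pad short ones
        return out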
diff --git a/plotting.py b/plotting.py
index 92082a3..c177253 100644
--- a/plotting.py
+++ b/plotting.py
@@ -40,7 +40,7 @@ def _plot_X(train, test, y_train, y_test, w_train, w_test, varlist, feature):
         min(min(flat_train), min(flat_test)),
         max(max(flat_train), max(flat_test)),
         30)
-    color = iter(cm.rainbow(np.linspace(0, 1, 2)))
+    color = iter(cm.rainbow(np.linspace(0, 1, len(np.unique(y_train)))))
     # -- loop through the classes
     for k in range(len(np.unique(y_train))):
         c = next(color)
@@ -48,7 +48,7 @@ def _plot_X(train, test, y_train, y_test, w_train, w_test, varlist, feature):
             bins=bins,
             histtype='step',
             normed=True,
-            label='Train - class: '+str(k),
+            label='Train - class: ' + str(k),
             weights=w_train_ext[y_train_ext == k],
             color=c,
             linewidth=1)
@@ -73,8 +73,10 @@ def _plot_X(train, test, y_train, y_test, w_train, w_test, varlist, feature):
         #plt.show()
         column_counter += 1
 
-def plot_inputs(X_jets_train, X_jets_test, X_photons_train, X_photons_test,
-                X_muons_train, X_muons_test, y_train, y_test, w_train, w_test, varlist):
+def plot_inputs(data, particle_names):
     '''
     Args:
         X_jets_train: ndarray [n_ev_train, n_jet_feat] containing the
@@ -105,6 +107,16 @@ def plot_inputs(X_jets_train, X_jets_test, X_photons_train, X_photons_test,
         sets of each class for each feature
     '''
-    _plot_X(X_jets_train, X_jets_test, y_train, y_test, w_train, w_test, varlist, 'Jet')
-    _plot_X(X_photons_train, X_photons_test, y_train, y_test, w_train, w_test, varlist, 'Photon')
-    _plot_X(X_muons_train, X_muons_test, y_train, y_test, w_train, w_test, varlist, 'Muon')
+    for particle in particle_names:
+        _plot_X(
+            data['X_' + particle + '_train'],
+            data['X_' + particle + '_test'],
+            data['y_train'],
+            data['y_test'],
+            data['w_train'],
+            data['w_test'],
+            data['varlist'],
+            particle)
diff --git a/utils.py b/utils.py
index a61fc72..ca27c59 100644
--- a/utils.py
+++ b/utils.py
@@ -1,4 +1,5 @@
 import logging
+import json
 
 def configure_logging():
     rlogger = logging.getLogger()
@@ -7,4 +8,15 @@ def configure_logging():
     logging.addLevelName(logging.WARNING, "\033[1;31m{:8}\033[1;0m".format(logging.getLevelName(logging.WARNING)))
     logging.addLevelName(logging.ERROR, "\033[1;35m{:8}\033[1;0m".format(logging.getLevelName(logging.ERROR)))
     logging.addLevelName(logging.INFO, "\033[1;32m{:8}\033[1;0m".format(logging.getLevelName(logging.INFO)))
-    logging.addLevelName(logging.DEBUG, "\033[1;34m{:8}\033[1;0m".format(logging.getLevelName(logging.DEBUG)))
\ No newline at end of file
+    logging.addLevelName(logging.DEBUG, "\033[1;34m{:8}\033[1;0m".format(logging.getLevelName(logging.DEBUG)))
+
+def load_config(config_file):
+    # TO DO: make sure that particle names don't have underscores in them
+    with open(config_file, 'r') as f:
+        config = json.load(f)
+    required_keys = ['classes', 'particles']
+
+    for k in required_keys:
+        if k not in config:
+            raise KeyError('pipeline configuration requires key: {}'.format(k))
+
+    return config
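A possible end-to-end usage of load_config, with a made-up file name:

    import utils

    config = utils.load_config('config.json')   # hypothetical path
    print sorted(config['particles'].keys())    # e.g. ['jet', 'muon', 'photon']

The underscore caveat in the TO DO is load-bearing: pipeline.py recovers each particle name from data keys such as 'X_jet_train' via key.split('_')[1], so a particle named, say, 'fat_jet' would resolve to 'fat' and the max_length lookup would fail.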