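'''
pipeline.py
End-to-end training pipeline: reads ROOT ntuples into ndarrays, shuffles,
splits and scales them, pads the per-particle inputs, trains a network in
either classification or regression mode, and plots its performance.

Example invocation (a sketch, inferred from the argparse setup below;
config.json and my_model are placeholder names):
    python pipeline.py config.json my_model classification --tree mini
'''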
import cPickle
import logging

import utils
from data_processing import read_in, shuffle_split_scale, padding
from nn_with_modes import train, test
from plotting import plot_inputs, plot_confusion, plot_regression, save_roc_curves
def main(json_config, model_name, mode, tree_name):
    '''
    Args:
    -----
        json_config: path to a JSON file containing a dictionary that maps the names
                     of the different classes in the classification problem to the
                     paths of the ROOT files associated with each class; for example:
                     {
                         "ttbar" :
                         [
                             "/path/to/file1.root",
                             "/path/to/file2.root"
                         ],
                         "qcd" :
                         [
                             "/path/to/file3.root",
                             "/path/to/file4.root"
                         ],
                         ...
                     }
        model_name: string, name used to identify this particular network and its outputs
        mode: string, either 'classification' or 'regression'
        tree_name: string, name of the tree that contains the correct branches
    Saves:
    ------
        'processed_data_<config hash>_<mode hash>.pkl': dictionary with the processed
        ndarrays (X, y, w) for all particles, for both training and testing
    '''
    logger = logging.getLogger('Main')

    # -- load in the JSON file
    logger.info('Loading information from ' + json_config)
    config = utils.load_config(json_config)
    class_files_dict = config['classes']
    particles_dict = config['particles']
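    # the config JSON is assumed (from the way it is accessed here and in the
    # padding step below) to have the following shape; the branch names and the
    # particle key are illustrative only:
    # {
    #     "classes"   : { "ttbar" : ["/path/to/file1.root", ...], ... },
    #     "particles" : { "jet" : { "branches" : ["jet_pt", "jet_eta", ...],
    #                               "max_length" : 5 },
    #                     ... }
    # }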
    # -- hash the config dictionary to check if the pickled data exists
    from hashlib import md5

    def sha(s):
        '''Get a short, unique identifier for an object'''
        m = md5()
        m.update(repr(s))
        return m.hexdigest()[:5]

    # -- if the pickle exists, use it
    pickle_name = 'processed_data_' + sha(config) + '_' + sha(mode) + '.pkl'
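    # e.g. sha(config) yields a stable 5-character fingerprint of the config
    # dict, so re-running the pipeline with an identical config and mode reuses
    # the cached pickle instead of re-processing the ROOT files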
    try:
        logger.info('Attempting to read from {}'.format(pickle_name))
        data = cPickle.load(open(pickle_name, 'rb'))
        logger.info('Pre-processed data found and loaded from pickle')
    # -- otherwise, process the new data
    except IOError:
        logger.info('Pre-processed data not found in {}'.format(pickle_name))
        logger.info('Processing data')
        # -- transform ROOT files into standard ML format (ndarrays)
        X, y, w, le = read_in(class_files_dict, tree_name, particles_dict, mode)
        # -- shuffle, split samples into train and test set, scale features
        data = shuffle_split_scale(X, y, w)
        data.update({
            'varlist' : [
                branch
                for particle_info in particles_dict.values()
                for branch in particle_info['branches']
            ],
            'LabelEncoder' : le
        })
        # -- plot distributions:
        # this should produce normed, weighted histograms of the input
        # distributions for all variables, with the train and test distributions
        # shown for every class; plots are saved out as PDFs with informative names
        logger.info('Saving input distributions in ./plots/')
        plot_inputs(data, particles_dict)

        logger.info('Padding')
        for key in data:
            # no padding for the `event` matrix; this assumes the naming
            # convention X_<particle>_train, X_<particle>_test
            if key.startswith('X_') and 'event' not in key:
                data[key] = padding(data[key], particles_dict[key.split('_')[1]]['max_length'])
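        # a sketch of the assumed behavior of `padding` (from data_processing):
        # it zero-pads or truncates the jagged per-event particle arrays to a
        # fixed length, so that e.g. a column of variable-length jet lists
        # becomes a rectangular ndarray with max_length entries per event,
        # suitable as fixed-size input to the RNN streams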
        # -- save out to pickle
        logger.info('Saving processed data to {}'.format(pickle_name))
        cPickle.dump(data,
                     open(pickle_name, 'wb'),
                     protocol=cPickle.HIGHEST_PROTOCOL)
    # -- train
    # design a Keras NN with three RNN streams (jets, photons, muons),
    # combine the outputs and process them through a stack of FF layers;
    # use a validation split of 20%; save the weights out to HDF5 and the
    # model to YAML
    # the exact signatures of train and test are assumed from the
    # nn_with_modes import above
    net = train(data, model_name, mode)
    yhat = test(net, data)
    logger.debug(data['X_electron_test'])
    logger.debug(data['X_muon_test'])

    # -- plot performance by mode
    if mode == 'regression':
        plot_regression(yhat, data)
    if mode == 'classification':
        plot_confusion(yhat, data)

    # -- produce ROC curves to evaluate performance and save them out to PDF
    save_roc_curves(yhat, data, model_name)
if __name__ == '__main__':
    import sys
    import argparse

    utils.configure_logging()

    # -- read in arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('config', help="path to JSON file that specifies classes and corresponding ROOT files' paths")
    parser.add_argument('model_name', help="name used to identify this particular network and its outputs")
    parser.add_argument('mode', help="classification or regression")
    parser.add_argument('--tree', help="name of the tree to open in the ntuples", default='mini')
    args = parser.parse_args()

    if args.mode not in ('classification', 'regression'):
        raise ValueError('Mode must be classification or regression')

    # -- pass arguments to main
    sys.exit(main(args.config, args.model_name, args.mode, args.tree))